library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)

## Warning: package 'skimr' was built under R version 4.5.3

library(janitor)

## Warning: package 'janitor' was built under R version 4.5.3

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(knitr)
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.5.3

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.5.3

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

library(dbscan)

## Warning: package 'dbscan' was built under R version 4.5.3

## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram

library(fpc)

## Warning: package 'fpc' was built under R version 4.5.3

## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan

library(LPCM)

## Warning: package 'LPCM' was built under R version 4.5.3

## 
## Attaching package: 'LPCM'
## 
## The following object is masked from 'package:lubridate':
## 
##     ms

library(dplyr)
library(e1071)

## Warning: package 'e1071' was built under R version 4.5.3

## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

Tahap 1: Load Data & Inspeksi Awal

1.1 Load Dataset

df <- read_excel("Dry_Bean_Dataset.xlsx")

cat("Dataset berhasil dimuat\n")

## Dataset berhasil dimuat

cat("Jumlah baris  :", nrow(df), "\n")

## Jumlah baris  : 13611

cat("Jumlah kolom  :", ncol(df), "\n")

## Jumlah kolom  : 17

cat("Total sel     :", nrow(df) * ncol(df), "\n")

## Total sel     : 231387

1.2 Preview Data

df %>%
  head(10) %>%
  kable(caption = "10 Baris Pertama Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE,
                font_size = 12)

10 Baris Pertama Dataset
Area	Perimeter	MajorAxisLength	MinorAxisLength	AspectRation	Eccentricity	ConvexArea	EquivDiameter	Extent	Solidity	roundness	Compactness	ShapeFactor1	ShapeFactor2	ShapeFactor3	ShapeFactor4	Class
28395	610.291	208.1781	173.8887	1.197191	0.5498122	28715	190.1411	0.7639225	0.9888560	0.9580271	0.9133578	0.0073315	0.0031473	0.8342224	0.9987239	SEKER
28734	638.018	200.5248	182.7344	1.097357	0.4117853	29172	191.2728	0.7839681	0.9849856	0.8870336	0.9538608	0.0069787	0.0035636	0.9098505	0.9984303	SEKER
29380	624.110	212.8261	175.9311	1.209713	0.5627273	29690	193.4109	0.7781132	0.9895588	0.9478495	0.9087742	0.0072439	0.0030477	0.8258706	0.9990661	SEKER
30008	645.884	210.5580	182.5165	1.153638	0.4986160	30724	195.4671	0.7826813	0.9766957	0.9039364	0.9283288	0.0070167	0.0032146	0.8617944	0.9941988	SEKER
30140	620.134	201.8479	190.2793	1.060798	0.3336797	30417	195.8965	0.7730980	0.9908933	0.9848771	0.9705155	0.0066970	0.0036650	0.9419004	0.9991661	SEKER
30279	634.927	212.5606	181.5102	1.171067	0.5204007	30600	196.3477	0.7756885	0.9895098	0.9438518	0.9237260	0.0070201	0.0031528	0.8532696	0.9992358	SEKER
30477	670.033	211.0502	184.0391	1.146768	0.4894779	30970	196.9886	0.7624015	0.9840814	0.8530799	0.9333736	0.0069249	0.0032420	0.8711862	0.9990487	SEKER
30519	629.727	212.9968	182.7372	1.165590	0.5137596	30847	197.1243	0.7706818	0.9893669	0.9671092	0.9254804	0.0069792	0.0031583	0.8565140	0.9983446	SEKER
30685	635.681	213.5341	183.1571	1.165852	0.5140809	31044	197.6597	0.7715615	0.9884358	0.9542398	0.9256585	0.0069589	0.0031515	0.8568437	0.9989530	SEKER
30834	631.934	217.2278	180.8975	1.200834	0.5536422	31120	198.1390	0.7836828	0.9908098	0.9702782	0.9121254	0.0070451	0.0030080	0.8319728	0.9990611	SEKER

1.3 Tipe Data Tiap Kolom

tipe_data <- data.frame(
  Kolom    = names(df),
  Tipe     = sapply(df, class),
  row.names = NULL
)

tipe_data %>%
  kable(caption = "Tipe Data Tiap Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Tipe Data Tiap Kolom
Kolom	Tipe
Area	numeric
Perimeter	numeric
MajorAxisLength	numeric
MinorAxisLength	numeric
AspectRation	numeric
Eccentricity	numeric
ConvexArea	numeric
EquivDiameter	numeric
Extent	numeric
Solidity	numeric
roundness	numeric
Compactness	numeric
ShapeFactor1	numeric
ShapeFactor2	numeric
ShapeFactor3	numeric
ShapeFactor4	numeric
Class	character

1.4 Statistik Deskriptif

df %>%
  select(-Class) %>%
  summary() %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 11)

Statistik Deskriptif Fitur Numerik
Area	Perimeter	MajorAxisLength	MinorAxisLength	AspectRation	Eccentricity	ConvexArea	EquivDiameter	Extent	Solidity	roundness	Compactness	ShapeFactor1	ShapeFactor2	ShapeFactor3	ShapeFactor4
Min. : 20420	Min. : 524.7	Min. :183.6	Min. :122.5	Min. :1.025	Min. :0.2190	Min. : 20684	Min. :161.2	Min. :0.5553	Min. :0.9192	Min. :0.4896	Min. :0.6406	Min. :0.002778	Min. :0.0005642	Min. :0.4103	Min. :0.9477
1st Qu.: 36328	1st Qu.: 703.5	1st Qu.:253.3	1st Qu.:175.8	1st Qu.:1.432	1st Qu.:0.7159	1st Qu.: 36715	1st Qu.:215.1	1st Qu.:0.7186	1st Qu.:0.9857	1st Qu.:0.8321	1st Qu.:0.7625	1st Qu.:0.005900	1st Qu.:0.0011535	1st Qu.:0.5814	1st Qu.:0.9937
Median : 44652	Median : 794.9	Median :296.9	Median :192.4	Median :1.551	Median :0.7644	Median : 45178	Median :238.4	Median :0.7599	Median :0.9883	Median :0.8832	Median :0.8013	Median :0.006645	Median :0.0016935	Median :0.6420	Median :0.9964
Mean : 53048	Mean : 855.3	Mean :320.1	Mean :202.3	Mean :1.583	Mean :0.7509	Mean : 53768	Mean :253.1	Mean :0.7497	Mean :0.9871	Mean :0.8733	Mean :0.7999	Mean :0.006564	Mean :0.0017159	Mean :0.6436	Mean :0.9951
3rd Qu.: 61332	3rd Qu.: 977.2	3rd Qu.:376.5	3rd Qu.:217.0	3rd Qu.:1.707	3rd Qu.:0.8105	3rd Qu.: 62294	3rd Qu.:279.4	3rd Qu.:0.7869	3rd Qu.:0.9900	3rd Qu.:0.9169	3rd Qu.:0.8343	3rd Qu.:0.007271	3rd Qu.:0.0021703	3rd Qu.:0.6960	3rd Qu.:0.9979
Max. :254616	Max. :1985.4	Max. :738.9	Max. :460.2	Max. :2.430	Max. :0.9114	Max. :263261	Max. :569.4	Max. :0.8662	Max. :0.9947	Max. :0.9907	Max. :0.9873	Max. :0.010451	Max. :0.0036650	Max. :0.9748	Max. :0.9997

skim(df)

Data summary
Name	df
Number of rows	13611
Number of columns	17
_______________________
Column type frequency:
character	1
numeric	16
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Class	0	1	4	8	0	7	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Area	1	53048.28	29324.10	20420.00	36328.00	44652.00	61332.00	254616.00	▇▂▁▁▁
Perimeter	1	855.28	214.29	524.74	703.52	794.94	977.21	1985.37	▇▆▁▁▁
MajorAxisLength	1	320.14	85.69	183.60	253.30	296.88	376.50	738.86	▇▆▂▁▁
MinorAxisLength	1	202.27	44.97	122.51	175.85	192.43	217.03	460.20	▇▇▁▁▁
AspectRation	1	1.58	0.25	1.02	1.43	1.55	1.71	2.43	▂▇▅▂▁
Eccentricity	1	0.75	0.09	0.22	0.72	0.76	0.81	0.91	▁▁▂▇▇
ConvexArea	1	53768.20	29774.92	20684.00	36714.50	45178.00	62294.00	263261.00	▇▂▁▁▁
EquivDiameter	1	253.06	59.18	161.24	215.07	238.44	279.45	569.37	▇▆▁▁▁
Extent	1	0.75	0.05	0.56	0.72	0.76	0.79	0.87	▁▁▅▇▂
Solidity	1	0.99	0.00	0.92	0.99	0.99	0.99	0.99	▁▁▁▁▇
roundness	1	0.87	0.06	0.49	0.83	0.88	0.92	0.99	▁▁▂▇▇
Compactness	1	0.80	0.06	0.64	0.76	0.80	0.83	0.99	▂▅▇▂▁
ShapeFactor1	1	0.01	0.00	0.00	0.01	0.01	0.01	0.01	▁▃▇▃▁
ShapeFactor2	1	0.00	0.00	0.00	0.00	0.00	0.00	0.00	▇▇▇▃▁
ShapeFactor3	1	0.64	0.10	0.41	0.58	0.64	0.70	0.97	▂▇▇▃▁
ShapeFactor4	1	1.00	0.00	0.95	0.99	1.00	1.00	1.00	▁▁▁▁▇

1.5 Jumlah Nilai Unik Tiap Kolom

nilai_unik <- data.frame(
  Kolom       = names(df),
  Nilai_Unik  = sapply(df, n_distinct),
  row.names   = NULL
)

nilai_unik %>%
  kable(caption = "Jumlah Nilai Unik per Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Jumlah Nilai Unik per Kolom
Kolom	Nilai_Unik
Area	12011
Perimeter	13416
MajorAxisLength	13543
MinorAxisLength	13543
AspectRation	13543
Eccentricity	13543
ConvexArea	12066
EquivDiameter	12011
Extent	13535
Solidity	13526
roundness	13543
Compactness	13543
ShapeFactor1	13543
ShapeFactor2	13543
ShapeFactor3	13543
ShapeFactor4	13543
Class	7

1.6 Distribusi Kelas

dist_kelas <- df %>%
  count(Class, name = "Jumlah") %>%
  arrange(desc(Jumlah)) %>%
  mutate(Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1))

dist_kelas %>%
  kable(caption = "Distribusi Data per Kelas") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Distribusi Data per Kelas
Class	Jumlah	Persentase
DERMASON	3546	26.1%
SIRA	2636	19.4%
SEKER	2027	14.9%
HOROZ	1928	14.2%
CALI	1630	12.0%
BARBUNYA	1322	9.7%
BOMBAY	522	3.8%

warna <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6","#1abc9c","#e67e22")

# Bar chart
p1 <- ggplot(dist_kelas, aes(x = reorder(Class, -Jumlah),
                             y = Jumlah, fill = Class)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.5, fontface = "bold") +
  scale_fill_manual(values = warna) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(title = "Distribusi Jumlah Data per Kelas",
       x = "Kelas", y = "Jumlah") +
  theme_minimal(base_size = 13) +
  theme(plot.title   = element_text(face = "bold", hjust = 0.5),
        axis.text.x  = element_text(face = "bold"))

# Pie chart
p2 <- ggplot(dist_kelas, aes(x = "", y = Jumlah, fill = Class)) +
  geom_col(width = 1, color = "white", linewidth = 0.5) +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(Class, "\n", Persentase)),
            position = position_stack(vjust = 0.5),
            size = 3.5, fontface = "bold", color = "white") +
  scale_fill_manual(values = warna) +
  labs(title = "Proporsi Kelas") +
  theme_void(base_size = 13) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 2)

Tahap 2: Cek & Tangani Missing Values

2.1 Cek Missing Values

missing_df <- data.frame(
  Kolom          = names(df),
  Missing_Count  = sapply(df, function(x) sum(is.na(x))),
  Missing_Persen = sapply(df, function(x) 
    percent(mean(is.na(x)), accuracy = 0.01)),
  row.names = NULL
)

missing_df %>%
  kable(caption = "Ringkasan Missing Values per Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(missing_df$Missing_Count > 0),
           background = "#fde8e8", color = "#c0392b")

Ringkasan Missing Values per Kolom
Kolom	Missing_Count	Missing_Persen
Area	0	0.00%
Perimeter	0	0.00%
MajorAxisLength	0	0.00%
MinorAxisLength	0	0.00%
AspectRation	0	0.00%
Eccentricity	0	0.00%
ConvexArea	0	0.00%
EquivDiameter	0	0.00%
Extent	0	0.00%
Solidity	0	0.00%
roundness	0	0.00%
Compactness	0	0.00%
ShapeFactor1	0	0.00%
ShapeFactor2	0	0.00%
ShapeFactor3	0	0.00%
ShapeFactor4	0	0.00%
Class	0	0.00%

cat("\nTotal missing values:", sum(is.na(df)), "\n")

## 
## Total missing values: 0

2.2 Cek Data Duplikat

n_duplikat <- sum(duplicated(df))
cat("Jumlah baris duplikat:", n_duplikat, "\n")

## Jumlah baris duplikat: 68

if (n_duplikat > 0) {
  df <- df %>% distinct()
  cat("Duplikat dihapus. Shape baru:", nrow(df), "x", ncol(df), "\n")
} else {
  cat("Tidak ada duplikat\n")
}

## Duplikat dihapus. Shape baru: 13543 x 17

2.3 Kesimpulan Kualitas Data

kesimpulan <- data.frame(
  Aspek      = c("Shape awal", "Total missing values",
                 "Data duplikat", "Jumlah kelas", "Fitur numerik"),
  Hasil      = c(
    paste0(nrow(df), " baris × ", ncol(df), " kolom"),
    paste0(sum(is.na(df)), " (tidak ada)"),
    paste0(n_duplikat,     " (tidak ada)"),
    paste0(n_distinct(df$Class), " kelas"),
    paste0(ncol(select(df, -Class)), " kolom")
  ),
  Status     = c("oke", "oke", "oke", "oke", "oke")
)

kesimpulan %>%
  kable(caption = "Kesimpulan Kualitas Data") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  column_spec(3, color = "green", bold = TRUE)

Kesimpulan Kualitas Data
Aspek	Hasil	Status
Shape awal	13543 baris × 17 kolom	oke
Total missing values	0 (tidak ada)	oke
Data duplikat	68 (tidak ada)	oke
Jumlah kelas	7 kelas	oke
Fitur numerik	16 kolom	oke

#3. Deteksi & Tangani Outlier ##3.1 Setup: Pisahkan Fitur Numerik

df_numerik <- df %>% select(-Class)
fitur_numerik <- names(df_numerik)

cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")

## Jumlah fitur numerik: 16

cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")

## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4

3.2 Deteksi Outlier: Metode IQR (Boxplot Rule)

iqr_summary <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Q1          = quantile(Nilai, 0.25),
    Q3          = quantile(Nilai, 0.75),
    IQR         = Q3 - Q1,
    Lower_Bound = Q1 - 1.5 * IQR,
    Upper_Bound = Q3 + 1.5 * IQR,
    N_Outlier   = sum(Nilai < (Q1 - 1.5 * IQR) | Nilai > (Q3 + 1.5 * IQR)),
    Pct_Outlier = round(N_Outlier / n() * 100, 2),
    .groups = "drop"
  ) %>%
  arrange(desc(N_Outlier))

iqr_summary %>%
  kable(
    caption = "Deteksi Outlier per Fitur (IQR)",
    digits  = 4,
    col.names = c("Fitur", "Q1", "Q3", "IQR",
                  "Batas Bawah", "Batas Atas",
                  "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(
    which(iqr_summary$Pct_Outlier >= 5),
    background = "#fde8e8", color = "#c0392b"
  )

Deteksi Outlier per Fitur (IQR)
Fitur	Q1	Q3	IQR	Batas Bawah	Batas Atas	N Outlier	% Outlier
Eccentricity	0.7151	0.8097	0.0945	0.5734	0.9515	833	6.15
Solidity	0.9857	0.9900	0.0043	0.9792	0.9965	774	5.72
ShapeFactor4	0.9937	0.9979	0.0042	0.9875	1.0041	760	5.61
MinorAxisLength	175.8864	217.2454	41.3590	113.8478	279.2840	567	4.19
Area	36282.5000	61382.0000	25099.5000	-1366.7500	99031.2500	551	4.07
ConvexArea	36673.0000	62360.0000	25687.0000	-1857.5000	100890.5000	549	4.05
ShapeFactor1	0.0059	0.0073	0.0014	0.0038	0.0093	533	3.94
EquivDiameter	214.9333	279.5604	64.6271	117.9927	376.5010	526	3.88
Perimeter	703.2300	977.1465	273.9165	292.3553	1388.0212	500	3.69
AspectRation	1.4307	1.7039	0.2733	1.0208	2.1138	485	3.58
MajorAxisLength	253.0868	376.3125	123.2257	68.2483	561.1510	379	2.80
Extent	0.7187	0.7868	0.0681	0.6166	0.8890	271	2.00
ShapeFactor3	0.5825	0.6963	0.1138	0.4118	0.8671	202	1.49
Compactness	0.7632	0.8345	0.0712	0.6564	0.9413	124	0.92
roundness	0.8334	0.9170	0.0836	0.7080	1.0425	98	0.72
ShapeFactor2	0.0012	0.0022	0.0010	-0.0004	0.0037	0	0.00

3.3 Deteksi Outlier: Metode Z-Score

zscore_summary <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    N_Outlier_Z   = sum(abs(scale(Nilai)) > 3),
    Pct_Outlier_Z = round(N_Outlier_Z / n() * 100, 2),
    .groups = "drop"
  ) %>%
  arrange(desc(N_Outlier_Z))

zscore_summary %>%
  kable(
    caption   = "Deteksi Outlier per Fitur (Z-score)",
    col.names = c("Fitur", "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Deteksi Outlier per Fitur (Z-score)
Fitur	N Outlier	% Outlier
MinorAxisLength	506	3.74
Area	483	3.57
ConvexArea	482	3.56
EquivDiameter	465	3.43
Perimeter	404	2.98
MajorAxisLength	316	2.33
ShapeFactor4	239	1.76
Solidity	237	1.75
Extent	139	1.03
Eccentricity	125	0.92
roundness	75	0.55
ShapeFactor1	53	0.39
AspectRation	21	0.16
ShapeFactor3	8	0.06
ShapeFactor2	5	0.04
Compactness	1	0.01

3.4 Ringkasan Level Baris

outlier_mask <- df_numerik

for (col in fitur_numerik) {
  q1  <- quantile(df_numerik[[col]], 0.25)
  q3  <- quantile(df_numerik[[col]], 0.75)
  iqr <- q3 - q1
  outlier_mask[[col]] <- df_numerik[[col]] < (q1 - 1.5 * iqr) |
    df_numerik[[col]] > (q3 + 1.5 * iqr)
}

baris_outlier <- rowSums(outlier_mask) > 0
n_baris_outlier <- sum(baris_outlier)
n_baris_bersih  <- nrow(df) - n_baris_outlier

ringkasan_baris <- data.frame(
  Kondisi    = c("Baris dengan outlier (≥1 fitur)", "Baris bersih", "Total"),
  Jumlah     = c(n_baris_outlier, n_baris_bersih, nrow(df)),
  Persentase = percent(
    c(n_baris_outlier, n_baris_bersih, nrow(df)) / nrow(df),
    accuracy = 0.01
  )
)

ringkasan_baris %>%
  kable(caption = "Ringkasan Outlier Level Baris (IQR)") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#fde8e8", color = "#c0392b") %>%
  row_spec(2, background = "#eafaf1", color = "#1e8449")

Ringkasan Outlier Level Baris (IQR)
Kondisi	Jumlah	Persentase
Baris dengan outlier (≥1 fitur)	3004	22.18%
Baris bersih	10539	77.82%
Total	13543	100.00%

3.5 Visualisasi Boxplot per Fitur

warna_box <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
               "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
               "#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")

df_long <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

ggplot(df_long, aes(x = Fitur, y = Nilai, fill = Fitur)) +
  geom_boxplot(
    outlier.shape  = 16,
    outlier.size   = 0.8,
    outlier.alpha  = 0.4,
    outlier.colour = "#e74c3c",
    show.legend    = FALSE
  ) +
  scale_fill_manual(values = warna_box) +
  facet_wrap(~Fitur, scales = "free", ncol = 4) +
  labs(
    title    = "Boxplot Tiap Fitur",
    subtitle = "Skala bebas per fitur (free scale)",
    x = NULL, y = "Nilai"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_blank(),
    axis.ticks.x  = element_blank(),
    strip.text    = element_text(face = "bold", size = 10)
  )

3.6 Visualisasi: Bar Chart % Outlier per Fitur

iqr_plot <- iqr_summary %>%
  mutate(
    Fitur = reorder(Fitur, Pct_Outlier),
    Warna = ifelse(Pct_Outlier >= 5, "#e74c3c", "#3498db")
  )

ggplot(iqr_plot, aes(x = Fitur, y = Pct_Outlier, fill = Warna)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_hline(yintercept = 5, linetype = "dashed",
             color = "#f39c12", linewidth = 1) +
  geom_text(aes(label = paste0(Pct_Outlier, "%")),
            hjust = -0.15, size = 3.5, fontface = "bold") +
  scale_fill_identity() +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.2)),
    labels = function(x) paste0(x, "%")
  ) +
  coord_flip() +
  annotate("text", x = 1, y = 5.3,
           label = "Batas 5%", color = "#f39c12",
           size = 3.5, fontface = "italic") +
  labs(
    title    = "Persentase Outlier per Fitur (Metode IQR)",
    subtitle = "Merah = outlier ≥ 5% | Biru = outlier < 5%",
    x = "Fitur", y = "% Outlier"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    panel.grid.major.y = element_blank()
  )

3.7 Penanganan Outlier: Winsorizing (Capping IQR)

df_clean <- df  

for (col in fitur_numerik) {
  q1  <- quantile(df_clean[[col]], 0.25)
  q3  <- quantile(df_clean[[col]], 0.75)
  iqr <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  
  df_clean[[col]] <- pmin(pmax(df_clean[[col]], lower), upper)
}

cat("Winsorizing selesai!\n")

## Winsorizing selesai!

cat("Shape df_clean:", nrow(df_clean), "x", ncol(df_clean), "\n")

## Shape df_clean: 13543 x 17

cat("Kolom Class tetap ada:", "Class" %in% names(df_clean), "\n")

## Kolom Class tetap ada: TRUE

outlier_setelah <- df_clean %>%
  select(-Class) %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    N_Outlier   = sum(Nilai < (quantile(Nilai, 0.25) - 1.5 * IQR(Nilai)) |
                        Nilai > (quantile(Nilai, 0.75) + 1.5 * IQR(Nilai))),
    Pct_Outlier = round(N_Outlier / n() * 100, 2),
    .groups = "drop"
  )

outlier_setelah %>%
  kable(
    caption   = "Verifikasi Outlier Setelah Winsorizing",
    col.names = c("Fitur", "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(
    which(outlier_setelah$N_Outlier == 0),
    background = "#eafaf1", color = "#1e8449"
  )

Verifikasi Outlier Setelah Winsorizing
Fitur	N Outlier	% Outlier
Area	0	0
AspectRation	0	0
Compactness	0	0
ConvexArea	0	0
Eccentricity	0	0
EquivDiameter	0	0
Extent	0	0
MajorAxisLength	0	0
MinorAxisLength	0	0
Perimeter	0	0
ShapeFactor1	0	0
ShapeFactor2	0	0
ShapeFactor3	0	0
ShapeFactor4	0	0
Solidity	0	0
roundness	0	0

3.8 Kesimpulan Tahap 3

fitur_tinggi <- iqr_summary %>%
  filter(Pct_Outlier >= 5) %>%
  pull(Fitur)

kesimpulan_3 <- data.frame(
  Aspek  = c(
    "Metode deteksi",
    "Fitur dengan outlier tertinggi",
    "Baris terdeteksi outlier (IQR)",
    "Metode penanganan",
    "Shape data setelah penanganan",
    "Status"
  ),
  Hasil  = c(
    "IQR (1.5×IQR) & Z-Score (|z| > 3)",
    paste(fitur_tinggi, collapse = ", "),
    paste0(n_baris_outlier, " baris (", round(n_baris_outlier/nrow(df)*100,2), "%)"),
    "Winsorizing / Capping ke batas IQR",
    paste0(nrow(df_clean), " baris × ", ncol(df_clean), " kolom"),
    "Selesai — lanjut ke Tahap 4 EDA"
  ),
  Status = c("info", "perlu cek", "perlu cek", "wajib", "oke", "oke")
)

kesimpulan_3 %>%
  kable(caption = "Kesimpulan Tahap 3: Deteksi & Penanganan Outlier") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(kesimpulan_3$Status == "oke"),
           background = "#eafaf1", color = "#1e8449") %>%
  row_spec(which(kesimpulan_3$Status == "perlu cek"),
           background = "#fef9e7", color = "#b7770d") %>%
  column_spec(3, bold = TRUE)

Kesimpulan Tahap 3: Deteksi & Penanganan Outlier
Aspek	Hasil	Status
Metode deteksi	IQR (1.5×IQR) & Z-Score (\|z\| > 3)	info
Fitur dengan outlier tertinggi	Eccentricity, Solidity, ShapeFactor4	perlu cek
Baris terdeteksi outlier (IQR)	3004 baris (22.18%)	perlu cek
Metode penanganan	Winsorizing / Capping ke batas IQR	wajib
Shape data setelah penanganan	13543 baris × 17 kolom	oke
Status	Selesai — lanjut ke Tahap 4 EDA	oke

Tahap 4: Exploratory Data Analysis (EDA)

4.1 Setup: Siapkan Data Numerik Bersih

df_numerik_clean <- df_clean %>% select(-Class)
fitur_numerik    <- names(df_numerik_clean)

cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")

## Jumlah fitur numerik: 16

cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")

## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4

4.2 Statistik Deskriptif (Setelah Winsorizing)

stat_desc <- df_numerik_clean %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min    = round(min(Nilai), 2),
    Q1     = round(quantile(Nilai, 0.25), 2),
    Median = round(median(Nilai), 2),
    Mean   = round(mean(Nilai), 2),
    Q3     = round(quantile(Nilai, 0.75), 2),
    Max    = round(max(Nilai), 2),
    SD     = round(sd(Nilai), 2),
    .groups = "drop"
  )

stat_desc %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)
Fitur	Min	Q1	Median	Mean	Q3	Max	SD
Area	20420.00	36282.50	44580.00	50166.95	61382.00	99031.25	18864.23
AspectRation	1.02	1.43	1.55	1.58	1.70	2.11	0.24
Compactness	0.66	0.76	0.80	0.80	0.83	0.94	0.06
ConvexArea	20684.00	36673.00	45122.00	50869.02	62360.00	100890.50	19253.93
Eccentricity	0.57	0.72	0.76	0.75	0.81	0.91	0.08
EquivDiameter	161.24	214.93	238.25	249.47	279.56	376.50	47.51
Extent	0.62	0.72	0.76	0.75	0.79	0.87	0.05
MajorAxisLength	183.60	253.09	296.40	318.31	376.31	561.15	80.41
MinorAxisLength	122.51	175.89	192.49	198.67	217.25	279.28	32.83
Perimeter	524.74	703.23	793.90	847.33	977.15	1388.02	189.36
ShapeFactor1	0.00	0.01	0.01	0.01	0.01	0.01	0.00
ShapeFactor2	0.00	0.00	0.00	0.00	0.00	0.00	0.00
ShapeFactor3	0.41	0.58	0.64	0.64	0.70	0.87	0.10
ShapeFactor4	0.99	0.99	1.00	1.00	1.00	1.00	0.00
Solidity	0.98	0.99	0.99	0.99	0.99	0.99	0.00
roundness	0.71	0.83	0.88	0.87	0.92	0.99	0.06

4.3 Distribusi Tiap Fitur (Histogram)

warna_hist <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
                "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
                "#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")

df_long_clean <- df_numerik_clean %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

ggplot(df_long_clean, aes(x = Nilai, fill = Fitur)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85, show.legend = FALSE) +
  geom_freqpoly(bins = 40, color = "black", linewidth = 0.5, show.legend = FALSE) +
  scale_fill_manual(values = warna_hist) +
  facet_wrap(~Fitur, scales = "free", ncol = 4) +
  labs(
    title    = "Distribusi Tiap Fitur Numerik (Setelah Winsorizing)",
    subtitle = "Skala bebas per fitur",
    x = "Nilai", y = "Frekuensi"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 10)
  )

4.4 Distribusi Fitur per Kelas

df_long_kelas <- df_clean %>%
  pivot_longer(-Class, names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

warna_kelas <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
                 "#9b59b6","#1abc9c","#e67e22")

ggplot(df_long_kelas, aes(x = Class, y = Nilai, fill = Class)) +
  geom_violin(trim = TRUE, alpha = 0.7, show.legend = FALSE) +
  geom_boxplot(width = 0.15, fill = "white",
               outlier.shape = NA, show.legend = FALSE) +
  scale_fill_manual(values = warna_kelas) +
  facet_wrap(~Fitur, scales = "free_y", ncol = 4) +
  labs(
    title    = "Distribusi Fitur per Kelas (Violin dan Boxplot)",
    subtitle = "Distribusi nilai fitur untuk tiap kelas kacang",
    x = "Kelas", y = "Nilai"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 9),
    axis.text.x   = element_text(angle = 30, hjust = 1, size = 8)
  )

4.5 Matriks Korelasi Antar Fitur

cor_matrix <- cor(df_numerik_clean, method = "pearson")

cor_matrix %>%
  round(2) %>%
  kable(caption = "Matriks Korelasi Pearson Antar Fitur") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 10) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Matriks Korelasi Pearson Antar Fitur
	Area	Perimeter	MajorAxisLength	MinorAxisLength	AspectRation	Eccentricity	ConvexArea	EquivDiameter	Extent	Solidity	roundness	Compactness	ShapeFactor1	ShapeFactor2	ShapeFactor3	ShapeFactor4
Area	1.00	0.99	0.95	0.92	0.37	0.39	1.00	1.00	0.00	-0.32	-0.54	-0.39	-0.89	-0.77	-0.39	-0.50
Perimeter	0.99	1.00	0.97	0.88	0.44	0.44	0.99	0.99	-0.04	-0.37	-0.62	-0.45	-0.86	-0.81	-0.45	-0.52
MajorAxisLength	0.95	0.97	1.00	0.77	0.59	0.58	0.95	0.96	-0.09	-0.33	-0.64	-0.60	-0.76	-0.88	-0.60	-0.57
MinorAxisLength	0.92	0.88	0.77	1.00	0.00	0.02	0.92	0.92	0.14	-0.23	-0.28	-0.01	-0.99	-0.49	-0.02	-0.33
AspectRation	0.37	0.44	0.59	0.00	1.00	0.96	0.37	0.38	-0.36	-0.30	-0.78	-0.99	0.01	-0.84	-0.98	-0.52
Eccentricity	0.39	0.44	0.58	0.02	0.96	1.00	0.39	0.39	-0.33	-0.34	-0.75	-0.98	0.01	-0.87	-0.99	-0.54
ConvexArea	1.00	0.99	0.95	0.92	0.37	0.39	1.00	1.00	0.00	-0.33	-0.54	-0.39	-0.89	-0.77	-0.40	-0.51
EquivDiameter	1.00	0.99	0.96	0.92	0.38	0.39	1.00	1.00	0.00	-0.31	-0.54	-0.40	-0.90	-0.78	-0.40	-0.50
Extent	0.00	-0.04	-0.09	0.14	-0.36	-0.33	0.00	0.00	1.00	0.21	0.34	0.35	-0.14	0.23	0.34	0.15
Solidity	-0.32	-0.37	-0.33	-0.23	-0.30	-0.34	-0.33	-0.31	0.21	1.00	0.65	0.33	0.18	0.38	0.34	0.60
roundness	-0.54	-0.62	-0.64	-0.28	-0.78	-0.75	-0.54	-0.54	0.34	0.65	1.00	0.78	0.25	0.79	0.77	0.53
Compactness	-0.39	-0.45	-0.60	-0.01	-0.99	-0.98	-0.39	-0.40	0.35	0.33	0.78	1.00	-0.01	0.87	1.00	0.55
ShapeFactor1	-0.89	-0.86	-0.76	-0.99	0.01	0.01	-0.89	-0.90	-0.14	0.18	0.25	-0.01	1.00	0.47	-0.01	0.30
ShapeFactor2	-0.77	-0.81	-0.88	-0.49	-0.84	-0.87	-0.77	-0.78	0.23	0.38	0.79	0.87	0.47	1.00	0.87	0.61
ShapeFactor3	-0.39	-0.45	-0.60	-0.02	-0.98	-0.99	-0.40	-0.40	0.34	0.34	0.77	1.00	-0.01	0.87	1.00	0.55
ShapeFactor4	-0.50	-0.52	-0.57	-0.33	-0.52	-0.54	-0.51	-0.50	0.15	0.60	0.53	0.55	0.30	0.61	0.55	1.00

4.6 Visualisasi Heatmap Korelasi

ggcorrplot(
  cor_matrix,
  method     = "square",
  type       = "lower",
  lab        = TRUE,
  lab_size   = 3,
  colors     = c("#e74c3c", "white", "#2ecc71"),
  title      = "Heatmap Korelasi Antar Fitur (Pearson)",
  ggtheme    = theme_minimal(base_size = 11),
  tl.cex     = 10,
  outline.col = "white"
) +
  theme(
    plot.title      = element_text(face = "bold", hjust = 0.5, size = 13),
    legend.position = "right"
  )

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4.7 Identifikasi Pasangan Fitur Berkorelasi Tinggi

cor_df <- as.data.frame(as.table(cor_matrix)) %>%
  rename(Fitur_1 = Var1, Fitur_2 = Var2, Korelasi = Freq) %>%
  filter(as.character(Fitur_1) < as.character(Fitur_2)) %>%
  mutate(Korelasi = round(Korelasi, 4),
         Abs_Kor  = abs(Korelasi)) %>%
  arrange(desc(Abs_Kor))

korelasi_tinggi <- cor_df %>% filter(Abs_Kor >= 0.8)

korelasi_tinggi %>%
  select(Fitur_1, Fitur_2, Korelasi) %>%
  kable(caption = "Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.95),
           background = "#fde8e8", color = "#c0392b") %>%
  row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.80 &
                   abs(korelasi_tinggi$Korelasi) < 0.95),
           background = "#fef9e7", color = "#b7770d")

Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)
Fitur_1	Fitur_2	Korelasi
Area	ConvexArea	0.9999
Compactness	ShapeFactor3	0.9989
Area	EquivDiameter	0.9972
ConvexArea	EquivDiameter	0.9970
AspectRation	Compactness	-0.9909
Eccentricity	ShapeFactor3	-0.9907
EquivDiameter	Perimeter	0.9905
MinorAxisLength	ShapeFactor1	-0.9877
ConvexArea	Perimeter	0.9874
Area	Perimeter	0.9866
AspectRation	ShapeFactor3	-0.9842
Compactness	Eccentricity	-0.9840
MajorAxisLength	Perimeter	0.9746
EquivDiameter	MajorAxisLength	0.9589
AspectRation	Eccentricity	0.9560
ConvexArea	MajorAxisLength	0.9529
Area	MajorAxisLength	0.9526
EquivDiameter	MinorAxisLength	0.9175
Area	MinorAxisLength	0.9172
ConvexArea	MinorAxisLength	0.9168
EquivDiameter	ShapeFactor1	-0.9013
Area	ShapeFactor1	-0.8913
ConvexArea	ShapeFactor1	-0.8904
MajorAxisLength	ShapeFactor2	-0.8844
MinorAxisLength	Perimeter	0.8784
ShapeFactor2	ShapeFactor3	0.8715
Compactness	ShapeFactor2	0.8679
Eccentricity	ShapeFactor2	-0.8662
Perimeter	ShapeFactor1	-0.8602
AspectRation	ShapeFactor2	-0.8444
Perimeter	ShapeFactor2	-0.8079

cat("\nTotal pasangan berkorelasi tinggi (|r| ≥ 0.80):", nrow(korelasi_tinggi), "\n")

## 
## Total pasangan berkorelasi tinggi (|r| ≥ 0.80): 31

4.8 Kesimpulan EDA

n_kor_tinggi <- nrow(korelasi_tinggi)
n_kor_sangat_tinggi <- sum(abs(korelasi_tinggi$Korelasi) >= 0.95)

kesimpulan_4 <- data.frame(
  Aspek  = c(
    "Jumlah fitur yang dianalisis",
    "Pasangan korelasi tinggi (|r| ≥ 0.80)",
    "Pasangan korelasi sangat tinggi (|r| ≥ 0.95)",
    "Fitur terkorelasi tertinggi",
    "Rekomendasi"
  ),
  Hasil  = c(
    paste0(length(fitur_numerik), " fitur numerik"),
    paste0(n_kor_tinggi, " pasangan"),
    paste0(n_kor_sangat_tinggi, " pasangan"),
    paste0(cor_df$Fitur_1[1], " — ", cor_df$Fitur_2[1],
           " (r = ", cor_df$Korelasi[1], ")"),
    "Pertimbangkan PCA atau drop fitur redundan"
  ),
  Status = c("oke", "perlu cek", "wajib", "info", "wajib")
)

kesimpulan_4 %>%
  kable(caption = "Kesimpulan Tahap 4: EDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(kesimpulan_4$Status == "oke"),
           background = "#eafaf1", color = "#1e8449") %>%
  row_spec(which(kesimpulan_4$Status == "perlu cek"),
           background = "#fef9e7", color = "#b7770d") %>%
  row_spec(which(kesimpulan_4$Status == "wajib"),
           background = "#fde8e8", color = "#c0392b") %>%
  column_spec(3, bold = TRUE)

Kesimpulan Tahap 4: EDA
Aspek	Hasil	Status
Jumlah fitur yang dianalisis	16 fitur numerik	oke
Pasangan korelasi tinggi (\|r\| ≥ 0.80)	31 pasangan	perlu cek
Pasangan korelasi sangat tinggi (\|r\| ≥ 0.95)	17 pasangan	wajib
Fitur terkorelasi tertinggi	Area — ConvexArea (r = 0.9999)	info
Rekomendasi	Pertimbangkan PCA atau drop fitur redundan	wajib

Tahap 5: Feature Selection / Reduksi Dimensi (Opsional)

5.1 Strategi: Drop Fitur Redundan

fitur_drop <- c()

for (i in seq_len(nrow(korelasi_tinggi))) {
  f1 <- as.character(korelasi_tinggi$Fitur_1[i])
  f2 <- as.character(korelasi_tinggi$Fitur_2[i])
  if (!(f1 %in% fitur_drop) && !(f2 %in% fitur_drop)) {
    fitur_drop <- c(fitur_drop, f2)
  }
}

fitur_terpilih <- setdiff(fitur_numerik, fitur_drop)

cat("Fitur yang redundan:\n")

## Fitur yang redundan:

cat(paste(" -", fitur_drop, collapse = "\n"), "\n\n")

##  - ConvexArea
##  - ShapeFactor3
##  - EquivDiameter
##  - Compactness
##  - ShapeFactor1
##  - Perimeter
##  - Eccentricity
##  - MajorAxisLength
##  - MinorAxisLength
##  - ShapeFactor2

cat("Fitur yang dipertahankan:\n")

## Fitur yang dipertahankan:

cat(paste(" +", fitur_terpilih, collapse = "\n"), "\n\n")

##  + Area
##  + AspectRation
##  + Extent
##  + Solidity
##  + roundness
##  + ShapeFactor4

cat("Jumlah fitur sebelum seleksi:", length(fitur_numerik), "\n")

## Jumlah fitur sebelum seleksi: 16

cat("Jumlah fitur setelah seleksi:", length(fitur_terpilih), "\n")

## Jumlah fitur setelah seleksi: 6

tabel_seleksi <- data.frame(
  Fitur  = fitur_numerik,
  Status = ifelse(fitur_numerik %in% fitur_drop, "Drop (Redundan)", "Dipertahankan")
)

tabel_seleksi %>%
  kable(caption = "Status Fitur Setelah Feature Selection") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(tabel_seleksi$Status == "Drop (Redundan)"),
           background = "#fde8e8", color = "#c0392b") %>%
  row_spec(which(tabel_seleksi$Status == "Dipertahankan"),
           background = "#eafaf1", color = "#1e8449")

Status Fitur Setelah Feature Selection
Fitur	Status
Area	Dipertahankan
Perimeter	Drop (Redundan)
MajorAxisLength	Drop (Redundan)
MinorAxisLength	Drop (Redundan)
AspectRation	Dipertahankan
Eccentricity	Drop (Redundan)
ConvexArea	Drop (Redundan)
EquivDiameter	Drop (Redundan)
Extent	Dipertahankan
Solidity	Dipertahankan
roundness	Dipertahankan
Compactness	Drop (Redundan)
ShapeFactor1	Drop (Redundan)
ShapeFactor2	Drop (Redundan)
ShapeFactor3	Drop (Redundan)
ShapeFactor4	Dipertahankan

5.2 PCA: Analisis Komponen Utama

df_seleksi <- df_clean %>% select(all_of(fitur_terpilih))

pca_result <- prcomp(df_seleksi, center = TRUE, scale. = TRUE)

var_explained <- summary(pca_result)$importance
var_df <- data.frame(
  PC              = paste0("PC", 1:ncol(var_explained)),
  SD              = round(var_explained[1, ], 4),
  Proporsi_Var    = round(var_explained[2, ], 4),
  Kumulatif_Var   = round(var_explained[3, ], 4)
)

var_df %>%
  kable(caption = "Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(var_df$Kumulatif_Var >= 0.90)[1],
           background = "#eafaf1", color = "#1e8449")

Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)
	PC	SD	Proporsi_Var	Kumulatif_Var
PC1	PC1	1.7837	0.5302	0.5302
PC2	PC2	1.0328	0.1778	0.7080
PC3	PC3	0.8542	0.1216	0.8296
PC4	PC4	0.7082	0.0836	0.9132
PC5	PC5	0.6681	0.0744	0.9876
PC6	PC6	0.2727	0.0124	1.0000

5.3 Scree Plot PCA

fviz_eig(
  pca_result,
  addlabels = TRUE,
  ylim      = c(0, max(var_explained[2, ] * 100) + 5),
  barfill   = "#3498db",
  barcolor  = "white",
  linecolor = "#e74c3c",
  main      = "Variansi Tiap Komponen Utama (PCA)"
) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

5.4 Biplot PCA

fviz_pca_biplot(
  pca_result,
  geom.ind   = "point",
  col.ind    = df_clean$Class,
  palette    = c("#2ecc71","#3498db","#e74c3c","#f39c12",
                 "#9b59b6","#1abc9c","#e67e22"),
  addEllipses = TRUE,
  ellipse.level = 0.90,
  col.var    = "black",
  repel      = TRUE,
  legend.title = "Kelas",
  title      = "Fitur & Distribusi Kelas (PCA)"
) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

## Ignoring unknown labels:
## • linetype : "Kelas"

## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many of them.

## Warning: Removed 2636 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

5.5 Keputusan: Pilih Pendekatan

n_pc_90 <- min(which(var_df$Kumulatif_Var >= 0.90))

cat("PC yang dibutuhkan untuk menjelaskan >= 90% variansi:", n_pc_90, "\n")

## PC yang dibutuhkan untuk menjelaskan >= 90% variansi: 4

cat("Jumlah fitur setelah seleksi manual:", length(fitur_terpilih), "\n\n")

## Jumlah fitur setelah seleksi manual: 6

if (n_pc_90 < length(fitur_terpilih)) {
  cat("Rekomendasi: Gunakan PCA dengan", n_pc_90, "komponen untuk clustering\n")
} else {
  cat("Rekomendasi: Gunakan seleksi manual (", length(fitur_terpilih), "fitur) — PCA tidak memberikan reduksi yang signifikan\n")
}

## Rekomendasi: Gunakan PCA dengan 4 komponen untuk clustering

pc_scores <- as.data.frame(pca_result$x[, 1:n_pc_90])
pc_scores$Class <- df_clean$Class

cat("Dimensi data PC scores:", nrow(pc_scores), "baris x",
    ncol(pc_scores) - 1, "komponen + kolom Class\n")

## Dimensi data PC scores: 13543 baris x 4 komponen + kolom Class

Tahap 6: Pisahkan Kolom Label (Class)

6.1 Pisahkan Fitur dan Label

X_clustering <- df_clean %>% select(all_of(fitur_terpilih))
y_label      <- df_clean$Class

cat("Data untuk Clustering\n")

## Data untuk Clustering

cat("\nShape fitur (X):", nrow(X_clustering), "x", ncol(X_clustering), "\n")

## 
## Shape fitur (X): 13543 x 6

cat("Panjang label (y):", length(y_label), "\n")

## Panjang label (y): 13543

cat("Label Class tersedia:", n_distinct(y_label), "kelas unik\n")

## Label Class tersedia: 7 kelas unik

cat("Nama kelas:", paste(unique(y_label), collapse = ", "), "\n\n")

## Nama kelas: SEKER, BARBUNYA, BOMBAY, CALI, HOROZ, SIRA, DERMASON

tabel_pisah <- data.frame(
  Objek    = c("X_clustering", "y_label"),
  Isi      = c(
    paste0(ncol(X_clustering), " fitur numerik terpilih"),
    "Kolom Class"
  ),
  Dipakai_Saat = c(
    "Training clustering",
    "Evaluasi hasil clustering"
  ),
  Dimensi  = c(
    paste0(nrow(X_clustering), " x ", ncol(X_clustering)),
    paste0(length(y_label), " baris")
  )
)

tabel_pisah %>%
  kable(caption = "Pemisahan Fitur dan Label untuk Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#eafaf1", color = "#1e8449") %>%
  row_spec(2, background = "#fef9e7", color = "#b7770d")

Pemisahan Fitur dan Label untuk Clustering
Objek	Isi	Dipakai_Saat	Dimensi
X_clustering	6 fitur numerik terpilih	Training clustering	13543 x 6
y_label	Kolom Class	Evaluasi hasil clustering	13543 baris

Tahap 7: Normalisasi / Standarisasi Fitur

7.1 Alasan Normalisasi

range_sebelum <- X_clustering %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min    = round(min(Nilai), 2),
    Max    = round(max(Nilai), 2),
    Range  = round(max(Nilai) - min(Nilai), 2),
    Mean   = round(mean(Nilai), 2),
    SD     = round(sd(Nilai), 2),
    .groups = "drop"
  ) %>%
  arrange(desc(Range))

range_sebelum %>%
  kable(caption = "Range Nilai Fitur Sebelum Normalisasi") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1:3, background = "#fde8e8", color = "#c0392b")

Range Nilai Fitur Sebelum Normalisasi
Fitur	Min	Max	Range	Mean	SD
Area	20420.00	99031.25	78611.25	50166.95	18864.23
AspectRation	1.02	2.11	1.09	1.58	0.24
roundness	0.71	0.99	0.28	0.87	0.06
Extent	0.62	0.87	0.25	0.75	0.05
Solidity	0.98	0.99	0.02	0.99	0.00
ShapeFactor4	0.99	1.00	0.01	1.00	0.00

cat("\nFitur dengan range terbesar:",
    range_sebelum$Fitur[1], "(range =", range_sebelum$Range[1], ")\n")

## 
## Fitur dengan range terbesar: Area (range = 78611.25 )

cat("Fitur dengan range terkecil:",
    range_sebelum$Fitur[nrow(range_sebelum)],
    "(range =", range_sebelum$Range[nrow(range_sebelum)], ")\n")

## Fitur dengan range terkecil: ShapeFactor4 (range = 0.01 )

cat("\nPerbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering\n")

## 
## Perbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering

7.2 Standardisasi Z-score (StandardScaler)

X_scaled_zscore <- X_clustering %>%
  mutate(across(everything(), ~ (. - mean(.)) / sd(.)))

zscore_check <- X_scaled_zscore %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Mean_Baru = round(mean(Nilai), 6),
    SD_Baru   = round(sd(Nilai), 6),
    Min_Baru  = round(min(Nilai), 4),
    Max_Baru  = round(max(Nilai), 4),
    .groups   = "drop"
  )

zscore_check %>%
  kable(caption = "Verifikasi Hasil Z-score Standardization") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(abs(zscore_check$Mean_Baru) < 1e-4 & abs(zscore_check$SD_Baru - 1) < 1e-4),
           background = "#eafaf1", color = "#1e8449")

Verifikasi Hasil Z-score Standardization
Fitur	SD_Baru	Min_Baru	Max_Baru
Area	1	-1.5769	2.5903
AspectRation	1	-2.3166	2.2397
Extent	1	-2.7860	2.4199
ShapeFactor4	1	-2.4066	1.3271
Solidity	1	-2.3219	2.0201
roundness	1	-2.8469	2.0014

cat("\nMean semua fitur (harus ~0):",
    round(mean(as.matrix(X_scaled_zscore)), 6), "\n")

## 
## Mean semua fitur (harus ~0): 0

cat("SD semua fitur (harus ~1):",
    round(mean(apply(X_scaled_zscore, 2, sd)), 6), "\n")

## SD semua fitur (harus ~1): 1

7.3 Min-Max Normalization (MinMaxScaler)

X_scaled_minmax <- X_clustering %>%
  mutate(across(everything(), ~ (. - min(.)) / (max(.) - min(.))))

minmax_check <- X_scaled_minmax %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min_Baru = round(min(Nilai), 6),
    Max_Baru = round(max(Nilai), 6),
    Mean_Baru = round(mean(Nilai), 4),
    .groups = "drop"
  )

minmax_check %>%
  kable(caption = "Verifikasi Hasil Min-Max Normalization (Rentang [0,1])") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(minmax_check$Min_Baru == 0 & minmax_check$Max_Baru == 1),
           background = "#eafaf1", color = "#1e8449")

Verifikasi Hasil Min-Max Normalization (Rentang [0,1])
Fitur	Max_Baru	Mean_Baru
Area	1	0.3784
AspectRation	1	0.5084
Extent	1	0.5352
ShapeFactor4	1	0.6446
Solidity	1	0.5348
roundness	1	0.5872

7.4 Perbandingan Distribusi: Sebelum vs Sesudah Normalisasi

fitur_sampel <- fitur_terpilih[1:min(6, length(fitur_terpilih))]

df_vis <- bind_rows(
  X_clustering %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Sebelum Normalisasi"),

  X_scaled_zscore %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Z-score"),

  X_scaled_minmax %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Min-Max")
) %>%
  mutate(
    Kondisi = factor(Kondisi,
                     levels = c("Sebelum Normalisasi", "Z-score", "Min-Max")),
    Fitur   = factor(Fitur, levels = fitur_sampel)
  )

ggplot(df_vis, aes(x = Nilai, fill = Kondisi)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.8) +
  scale_fill_manual(values = c("#e74c3c", "#3498db", "#2ecc71")) +
  facet_grid(Kondisi ~ Fitur, scales = "free") +
  labs(
    title    = "PeSebelum vs Sesudah Normalisasi",
    subtitle = paste0("Ditampilkan ", length(fitur_sampel),
                      " fitur pertama (skala bebas per panel)"),
    x = "Nilai", y = "Frekuensi",
    fill = "Metode"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 9),
    legend.position = "bottom"
  )

7.5 Pilih Metode Normalisasi untuk Clustering

X_final <- X_scaled_zscore
cat("Data Final untuk Clustering\n")

## Data Final untuk Clustering

cat("\nMetode normalisasi: Z-score (StandardScaler)\n")

## 
## Metode normalisasi: Z-score (StandardScaler)

cat("Shape X_final:", nrow(X_final), "x", ncol(X_final), "\n")

## Shape X_final: 13543 x 6

cat("Kolom:", paste(names(X_final), collapse = ", "), "\n\n")

## Kolom: Area, AspectRation, Extent, Solidity, roundness, ShapeFactor4

cat("Statistik X_final:\n")

## Statistik X_final:

print(round(sapply(X_final, function(x) c(mean=mean(x), sd=sd(x), min=min(x), max=max(x))), 4))

##         Area AspectRation  Extent Solidity roundness ShapeFactor4
## mean  0.0000       0.0000  0.0000   0.0000    0.0000       0.0000
## sd    1.0000       1.0000  1.0000   1.0000    1.0000       1.0000
## min  -1.5769      -2.3166 -2.7860  -2.3219   -2.8469      -2.4066
## max   2.5903       2.2397  2.4199   2.0201    2.0014       1.3271

8.1 Penentuan Jumlah Kluster Optimal

8.1.1 Metode Elbow (WSS)

set.seed(42)
wss_values <- sapply(1:10, function(k) {
  kmeans(X_final, centers = k, nstart = 10, iter.max = 100)$tot.withinss
})

wss_df <- data.frame(k = 1:10, WSS = wss_values)

delta_wss  <- diff(wss_values)        
pct_drop   <- abs(delta_wss) / wss_values[-length(wss_values)] * 100
elbow_k    <- which(pct_drop < 10)[1] + 1   
elbow_k    <- if (is.na(elbow_k)) 7 else elbow_k 

ggplot(wss_df, aes(x = k, y = WSS)) +
  geom_line(color = "#3498db", linewidth = 1.2) +
  geom_point(color = "#e74c3c", size = 4) +
  geom_vline(xintercept = elbow_k, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = elbow_k + 0.35, y = max(wss_values) * 0.85,
           label = paste0("k = ", elbow_k, " (siku)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 1:10) +
  labs(
    title    = "Metode Elbow — Penentuan Jumlah Kluster Optimal (K-Means)",
    subtitle = "Cari titik 'siku' di mana WSS mulai melandai",
    x = "Jumlah Kluster (k)", y = "Total Within-Cluster SS (WSS)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.1.2 Metode Silhouette

set.seed(42)
samp_idx_sil <- sample(nrow(X_final), min(2000, nrow(X_final)))
X_sil_sample <- X_final[samp_idx_sil, ]

sil_values_km <- sapply(2:8, function(k) {
  km  <- kmeans(X_final, centers = k, nstart = 10, iter.max = 100)
  sil <- silhouette(km$cluster[samp_idx_sil], dist(X_sil_sample))
  mean(sil[, 3])
})

sil_df_km <- data.frame(k = 2:8, Silhouette = sil_values_km)
best_k_km <- sil_df_km$k[which.max(sil_df_km$Silhouette)]

ggplot(sil_df_km, aes(x = k, y = Silhouette)) +
  geom_line(color = "#2ecc71", linewidth = 1.2) +
  geom_point(aes(color = k == best_k_km), size = 4, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#2ecc71", "TRUE" = "#e74c3c")) +
  geom_vline(xintercept = best_k_km, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = best_k_km + 0.3,
           y = max(sil_values_km) * 0.97,
           label = paste0("k = ", best_k_km, " (optimal)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Silhouette — Penentuan Jumlah Kluster Optimal (K-Means)",
    subtitle = "Skor lebih tinggi = kluster lebih kohesif & terpisah",
    x = "Jumlah Kluster (k)", y = "Rata-rata Silhouette Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

cat("k optimal berdasarkan Silhouette:", best_k_km, "\n")

## k optimal berdasarkan Silhouette: 2

8.1.3 Keputusan k

k_optimal <- 7

cat(" Keputusan: k =", k_optimal, "\n")

##  Keputusan: k = 7

cat(" Alasan   : Sesuai jumlah kelas asli (7 jenis kacang)\n")

##  Alasan   : Sesuai jumlah kelas asli (7 jenis kacang)

cat("            + dikonfirmasi Elbow & Silhouette\n")

##             + dikonfirmasi Elbow & Silhouette

8.2 Training K-Means

set.seed(42)

km_result <- kmeans(
  X_final,
  centers   = k_optimal,
  nstart    = 25,      
  iter.max  = 300,
  algorithm = "Hartigan-Wong"
)

km_labels <- km_result$cluster

cat("K-Means selesai!\n\n")

## K-Means selesai!

cat("Algoritma          : Hartigan-Wong\n")

## Algoritma          : Hartigan-Wong

cat("Metrik jarak       : Euclidean (L2 norm)\n")

## Metrik jarak       : Euclidean (L2 norm)

cat("Pusat kluster      : Mean (rata-rata koordinat)\n")

## Pusat kluster      : Mean (rata-rata koordinat)

cat("Jumlah kluster     :", nrow(km_result$centers), "\n")

## Jumlah kluster     : 7

cat("Iterasi konvergen  :", km_result$iter, "\n")

## Iterasi konvergen  : 6

cat("Total WSS          :", round(km_result$tot.withinss, 2), "\n")

## Total WSS          : 25231.11

cat("Between-SS/Total-SS:",
    round(km_result$betweenss / km_result$totss * 100, 2), "%\n\n")

## Between-SS/Total-SS: 68.95 %

cat("Distribusi anggota per kluster:\n")

## Distribusi anggota per kluster:

print(table(km_labels))

## km_labels
##    1    2    3    4    5    6    7 
## 2788 1408 2722 2484 1091 1179 1871

8.3 Evaluasi Internal K-Means

8.3.1 Silhouette Plot

set.seed(42)
samp_km    <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_km     <- silhouette(km_labels[samp_km], dist(X_final[samp_km, ]))
avg_sil_km <- mean(sil_km[, 3])

fviz_silhouette(sil_km,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_km, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  labs(
    title    = "K-Means Clustering (Silhouette)",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_km, 4),
                      " | Range: -1 (buruk) hingga 1 (sempurna)")
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.3.2 Tabel Metrik Internal

eval_km <- data.frame(
  Metrik     = c("Silhouette Score",
                 "Between-SS / Total-SS (%)",
                 "Total Within-SS",
                 "Iterasi Konvergen"),
  Nilai      = c(round(avg_sil_km, 4),
                 round(km_result$betweenss / km_result$totss * 100, 2),
                 round(km_result$tot.withinss, 2),
                 km_result$iter),
  Keterangan = c("Mendekati 1 = kluster padat & terpisah",
                 "Semakin tinggi = pemisahan kluster lebih baik",
                 "Semakin kecil = kluster lebih kompak",
                 "Cepat konvergen = solusi stabil")
)

eval_km %>%
  kable(caption = "Evaluasi Internal — K-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#eafaf1", color = "#1e8449")

Evaluasi Internal — K-Means Clustering
Metrik	Nilai	Keterangan
Silhouette Score	0.2688	Mendekati 1 = kluster padat & terpisah
Between-SS / Total-SS (%)	68.9500	Semakin tinggi = pemisahan kluster lebih baik
Total Within-SS	25231.1100	Semakin kecil = kluster lebih kompak
Iterasi Konvergen	6.0000	Cepat konvergen = solusi stabil

8.4 Visualisasi Hasil K-Means

8.4.1 Cluster Plot (PCA 2D)

warna_kluster <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
                   "#9b59b6","#1abc9c","#e67e22")

fviz_cluster(
  km_result,
  data         = X_final,
  palette      = warna_kluster,
  geom         = "point",
  ellipse.type = "convex",
  alpha        = 0.4,
  pointsize    = 1,
  ggtheme      = theme_minimal(base_size = 12)
) +
  labs(
    title    = "K-Means Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal, "Proyeksi ke 2 PC pertama"),
    color = "Kluster", fill = "Kluster", shape = "Kluster"
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.4.2 Distribusi Anggota per Kluster

dist_km <- data.frame(
  Kluster = paste0("K", sort(unique(km_labels))),
  Jumlah  = as.integer(table(km_labels))
) %>%
  mutate(
    Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1),
    Kluster    = reorder(Kluster, -Jumlah)
  )

ggplot(dist_km, aes(x = Kluster, y = Jumlah, fill = Kluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.8, fontface = "bold") +
  scale_fill_manual(values = warna_kluster) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Distribusi Anggota per Kluster — K-Means",
       x = "Kluster", y = "Jumlah Anggota") +
  theme_minimal(base_size = 12) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_text(face = "bold"))

8.4.3 Heatmap Profil Centroid

centroid_km <- as.data.frame(km_result$centers) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(centroid_km, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#2ecc71",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Centroid K-Means (Z-score)",
    subtitle = "Centroid = rata-rata koordinat semua titik di kluster | Merah = rendah | Hijau = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

8.5 Evaluasi Eksternal K-Means

purity_score <- function(labels_true, labels_pred) {
  tbl <- table(labels_true, labels_pred)
  sum(apply(tbl, 2, max)) / length(labels_true)
}

purity_km <- purity_score(y_label, km_labels)

table(Kluster = km_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — K-Means vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Tabel Kontingensi — K-Means vs Kelas Asli
Kluster	BARBUNYA	BOMBAY	CALI	DERMASON	HOROZ	SEKER	SIRA
K1	52	0	16	1407	15	95	1203
K2	76	127	710	40	296	4	155
K3	55	0	21	1456	24	52	1114
K4	3	1	2	583	0	1846	49
K5	943	0	15	52	20	30	31
K6	18	2	65	8	1036	0	50
K7	175	392	801	0	469	0	34

cat("\nPurity Score K-Means:", round(purity_km, 4),
    paste0("(", round(purity_km * 100, 2), "%)"), "\n")

## 
## Purity Score K-Means: 0.6054 (60.54%)

8.6 Kesimpulan K-Means

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Jumlah Kluster", "Silhouette Score",
             "Between-SS / Total-SS", "Purity Score", "Status"),
  Hasil  = c("K-Means (Hartigan-Wong)",
             "Mean / rata-rata koordinat (bukan titik data aktual)",
             "Euclidean (L2 norm)",
             paste0(k_optimal, " kluster"),
             round(avg_sil_km, 4),
             paste0(round(km_result$betweenss / km_result$totss * 100, 2), "%"),
             round(purity_km, 4),
             "lanjut ke Tahap K-Median")
) %>%
  kable(caption = "Kesimpulan Tahap 8: K-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#eafaf1", color = "#1e8449")

Kesimpulan Tahap 8: K-Means Clustering
Aspek	Hasil
Algoritma	K-Means (Hartigan-Wong)
Pusat Kluster	Mean / rata-rata koordinat (bukan titik data aktual)
Metrik Jarak	Euclidean (L2 norm)
Jumlah Kluster	7 kluster
Silhouette Score	0.2688
Between-SS / Total-SS	68.95%
Purity Score	0.6054
Status	lanjut ke Tahap K-Median

Tahap 9: K-Median Clustering

9.1 Penentuan Jumlah Kluster Optimal

9.1.1 Metode Elbow (Total Dissimilarity PAM)

set.seed(42)
dissim_values <- sapply(2:8, function(k) {
  cl <- clara(X_final, k = k, metric = "manhattan", samples = 200, pamLike = TRUE)
  cl$objective
})

dissim_df <- data.frame(k = 2:8, Dissimilarity = dissim_values)

ggplot(dissim_df, aes(x = k, y = Dissimilarity)) +
  geom_line(color = "#9b59b6", linewidth = 1.2) +
  geom_point(color = "#e74c3c", size = 4) +
  geom_vline(xintercept = 7, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = 7.3,
           y = max(dissim_values) * 0.95,
           label = "k = 7 (elbow)", color = "#e74c3c",
           size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Elbow — Penentuan Jumlah Kluster Optimal (K-Median/PAM)",
    subtitle = "Total Dissimilarity Manhattan — titik siku = k optimal",
    x = "Jumlah Kluster (k)", y = "Total Dissimilarity (swap)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.1.2 Metode Silhouette

set.seed(42)

sil_values_pam <- sapply(2:8, function(k) {
  cl <- clara(X_final, k = k, metric = "manhattan", samples = 200, pamLike = TRUE)
  mean(cl$silinfo$avg.width)
})

sil_df_pam <- data.frame(k = 2:8, Silhouette = sil_values_pam)
best_k_pam <- sil_df_pam$k[which.max(sil_df_pam$Silhouette)]

ggplot(sil_df_pam, aes(x = k, y = Silhouette)) +
  geom_line(color = "#9b59b6", linewidth = 1.2) +
  geom_point(aes(color = k == best_k_pam), size = 4, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#9b59b6", "TRUE" = "#e74c3c")) +
  geom_vline(xintercept = best_k_pam, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = best_k_pam + 0.3,
           y = max(sil_values_pam) * 0.97,
           label = paste0("k = ", best_k_pam, " (optimal)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Silhouette — Penentuan Jumlah Kluster Optimal (K-Median/PAM)",
    subtitle = "Skor lebih tinggi = kluster lebih kohesif & terpisah",
    x = "Jumlah Kluster (k)", y = "Rata-rata Silhouette Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

cat("k optimal berdasarkan Silhouette (K-Median/PAM):", best_k_pam, "\n")

## k optimal berdasarkan Silhouette (K-Median/PAM): 2

9.2 Training K-Median (PAM)

set.seed(42)

pam_result <- clara(
  X_final,
  k       = k_optimal,
  metric  = "manhattan",  
  samples = 200,          
  pamLike = TRUE,         
  stand   = FALSE         
)

pam_labels <- pam_result$clustering

cat("K-Median (CLARA) selesai!\n\n")

## K-Median (CLARA) selesai!

cat("Algoritma          : CLARA (Clustering Large Applications)\n")

## Algoritma          : CLARA (Clustering Large Applications)

cat("Setara dengan      : PAM / K-Median untuk dataset besar\n")

## Setara dengan      : PAM / K-Median untuk dataset besar

cat("Metrik jarak       : Manhattan (L1 norm) — sesuai PPT K-Median\n")

## Metrik jarak       : Manhattan (L1 norm) — sesuai PPT K-Median

cat("Pusat kluster      : Medoid (objek data aktual, bukan rata-rata)\n")

## Pusat kluster      : Medoid (objek data aktual, bukan rata-rata)

cat("Jumlah kluster     :", k_optimal, "\n")

## Jumlah kluster     : 7

cat("Objective          :", round(pam_result$objective, 4), "\n\n")

## Objective          : 2.6052

cat("Distribusi anggota per kluster:\n")

## Distribusi anggota per kluster:

print(table(pam_labels))

## pam_labels
##    1    2    3    4    5    6    7 
## 1956 3287 2695 1183  829 2051 1542

9.3 Evaluasi Internal K-Median

9.3.1 Silhouette Plot

set.seed(42)
samp_pam    <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_pam     <- silhouette(pam_labels[samp_pam], dist(X_final[samp_pam, ], method = "manhattan"))
avg_sil_pam <- mean(sil_pam[, 3])

fviz_silhouette(sil_pam,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_pam, linetype = "dashed",
             color = "#9b59b6", linewidth = 0.8) +
  labs(
    title    = "Silhouette Plot — K-Median (PAM) Clustering",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_pam, 4),
                      " | Manhattan distance")
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.3.2 Tabel Metrik Internal

eval_pam <- data.frame(
  Metrik     = c("Silhouette Score (↑ lebih baik)",
                 "Objective Function (swap) (↓ lebih baik)",
                 "Metrik Jarak",
                 "Representasi Pusat Kluster"),
  Nilai      = c(round(avg_sil_pam, 4),
                 round(pam_result$objective, 4),
                 "Manhattan (L1 norm)",
                 "Medoid (objek data aktual)"),
  Keterangan = c("Mendekati 1 = kluster padat & terpisah",
                 "Semakin kecil = kluster lebih kompak",
                 "Sum of absolute differences — robust terhadap outlier",
                 "Lebih interpretatif dari centroid K-Means")
)

eval_pam %>%
  kable(caption = "Evaluasi Internal — K-Median (PAM) Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#f5eef8", color = "#6c3483")

Evaluasi Internal — K-Median (PAM) Clustering
Metrik	Nilai	Keterangan
Silhouette Score (↑ lebih baik)	0.2415	Mendekati 1 = kluster padat & terpisah
Objective Function (swap) (↓ lebih baik)	2.6052	Semakin kecil = kluster lebih kompak
Metrik Jarak	Manhattan (L1 norm)	Sum of absolute differences — robust terhadap outlier
Representasi Pusat Kluster	Medoid (objek data aktual)	Lebih interpretatif dari centroid K-Means

9.4 Visualisasi Hasil K-Median

9.4.1 Cluster Plot (PCA 2D)

fviz_cluster(
  pam_result,
  data         = X_final,
  palette      = warna_kluster,
  geom         = "point",
  ellipse.type = "convex",
  alpha        = 0.4,
  pointsize    = 1,
  ggtheme      = theme_minimal(base_size = 12)
) +
  labs(
    title    = "Visualisasi K-Median (PAM) Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal,
                      " kluster | Manhattan distance | Proyeksi ke 2 PC pertama"),
    color = "Kluster", fill = "Kluster", shape = "Kluster"
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.4.2 Distribusi Anggota per Kluster

dist_pam <- data.frame(
  Kluster = paste0("K", sort(unique(pam_labels))),
  Jumlah  = as.integer(table(pam_labels))
) %>%
  mutate(
    Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1),
    Kluster    = reorder(Kluster, -Jumlah)
  )

ggplot(dist_pam, aes(x = Kluster, y = Jumlah, fill = Kluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.8, fontface = "bold") +
  scale_fill_manual(values = warna_kluster) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Distribusi Anggota per Kluster — K-Median (PAM)",
       x = "Kluster", y = "Jumlah Anggota") +
  theme_minimal(base_size = 12) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_text(face = "bold"))

9.4.3 Heatmap Profil Medoid

medoid_df <- as.data.frame(pam_result$medoids) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(medoid_df, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#9b59b6",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Medoid K-Median/PAM (Z-score)",
    subtitle = "Medoid = objek data aktual paling representatif | Merah = rendah | Ungu = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

9.5 Evaluasi Eksternal K-Median

purity_pam <- purity_score(y_label, pam_labels)

table(Kluster = pam_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — K-Median (PAM) vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Tabel Kontingensi — K-Median (PAM) vs Kelas Asli
Kluster	BARBUNYA	BOMBAY	CALI	DERMASON	HOROZ	SEKER	SIRA
K1	0	8	1	244	0	1685	18
K2	49	29	13	1961	17	271	947
K3	118	19	24	1242	36	49	1207
K4	74	123	635	38	203	4	106
K5	750	0	2	32	18	16	11
K6	319	340	897	21	179	2	293
K7	12	3	58	8	1407	0	54

cat("\nPurity Score K-Median (PAM):", round(purity_pam, 4),
    paste0("(", round(purity_pam * 100, 2), "%)"), "\n")

## 
## Purity Score K-Median (PAM): 0.6333 (63.33%)

9.6 Kesimpulan K-Median

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Jumlah Kluster", "Silhouette Score",
             "Objective (swap)", "Purity Score", "Status"),
  Hasil  = c("K-Median via CLARA (Clustering Large Applications)",
             "Medoid (objek data aktual — bukan rata-rata)",
             "Manhattan (L1 norm) — sesuai PPT",
             paste0(k_optimal, " kluster"),
             round(avg_sil_pam, 4),
             round(pam_result$objective, 4),
             round(purity_pam, 4),
             "lanjut ke Tahap Perbandingan")
) %>%
  kable(caption = "Kesimpulan Tahap 9: K-Median (PAM) Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#f5eef8", color = "#6c3483")

Kesimpulan Tahap 9: K-Median (PAM) Clustering
Aspek	Hasil
Algoritma	K-Median via CLARA (Clustering Large Applications)
Pusat Kluster	Medoid (objek data aktual — bukan rata-rata)
Metrik Jarak	Manhattan (L1 norm) — sesuai PPT
Jumlah Kluster	7 kluster
Silhouette Score	0.2415
Objective (swap)	2.6052
Purity Score	0.6333
Status	lanjut ke Tahap Perbandingan

Tahap 10: DBSCAN Clustering

10.1 Penentuan Parameter eps (k-distance plot)

set.seed(42)
samp_db <- sample(nrow(X_final), min(3000, nrow(X_final)))
X_db_sample <- X_final[samp_db, ]

minPts <- ncol(X_final) * 2  
kNNdistplot(X_db_sample, k = minPts)
abline(h = 1.5, col = "red", lty = 2, lwd = 2)
title(main = "k-distance Plot — Penentuan Nilai eps DBSCAN",
      sub  = paste0("Garis merah = kandidat eps | minPts = ", minPts))

10.2 Training DBSCAN

set.seed(42)

eps_val    <- 1.5
minPts_val <- ncol(X_final) * 2

db_result <- dbscan::dbscan(X_final, eps = eps_val, minPts = minPts_val)
db_labels <- db_result$cluster  # 0 = noise

cat("DBSCAN selesai!\n\n")

## DBSCAN selesai!

cat("Parameter eps     :", eps_val, "\n")

## Parameter eps     : 1.5

cat("Parameter minPts  :", minPts_val, "\n")

## Parameter minPts  : 12

cat("Jumlah kluster    :", max(db_labels), "\n")

## Jumlah kluster    : 1

cat("Titik noise (0)   :", sum(db_labels == 0), "\n\n")

## Titik noise (0)   : 0

cat("Distribusi label:\n")

## Distribusi label:

print(table(db_labels))

## db_labels
##     1 
## 13543

10.3 Evaluasi Internal DBSCAN

set.seed(42)
non_noise_idx <- which(db_labels != 0)

if (length(unique(db_labels[non_noise_idx])) >= 2) {
  samp_db2   <- sample(non_noise_idx, min(2000, length(non_noise_idx)))
  sil_db     <- silhouette(db_labels[samp_db2], dist(X_final[samp_db2, ]))
  avg_sil_db <- mean(sil_db[, 3])
  
  fviz_silhouette(sil_db,
                  palette       = "jco",
                  ggtheme       = theme_minimal(base_size = 11),
                  print.summary = FALSE) +
    geom_hline(yintercept = avg_sil_db, linetype = "dashed",
               color = "#e74c3c", linewidth = 0.8) +
    labs(
      title    = "Silhouette Plot — DBSCAN Clustering",
      subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_db, 4),
                        " | Noise points dikecualikan")
    ) +
    theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
          plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
} else {
  avg_sil_db <- NA
  cat("Tidak cukup kluster untuk menghitung Silhouette.\n")
}

## Tidak cukup kluster untuk menghitung Silhouette.

10.4 Visualisasi DBSCAN (PCA 2D)

pca_2d <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d$Kluster <- factor(ifelse(db_labels == 0, "Noise", paste0("K", db_labels)))

warna_db <- c("Noise" = "gray70",
              setNames(c("#2ecc71","#3498db","#e74c3c","#f39c12",
                         "#9b59b6","#1abc9c","#e67e22","#34495e"),
                       paste0("K", 1:8)))

ggplot(pca_2d, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_db) +
  labs(
    title    = "Visualisasi DBSCAN Clustering (PCA 2D)",
    subtitle = paste0("eps = ", eps_val, " | minPts = ", minPts_val,
                      " | Kluster = ", max(db_labels),
                      " | Noise = ", sum(db_labels == 0), " titik"),
    color = "Label"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

10.5 Evaluasi Eksternal DBSCAN

non_noise <- db_labels != 0
purity_db <- purity_score(y_label[non_noise], db_labels[non_noise])

cat("Purity Score DBSCAN (non-noise):", round(purity_db, 4),
    paste0("(", round(purity_db * 100, 2), "%)"), "\n")

## Purity Score DBSCAN (non-noise): 0.2618 (26.18%)

cat("Proporsi noise:", round(mean(db_labels == 0) * 100, 2), "%\n")

## Proporsi noise: 0 %

Tahap 11: Mean Shift Clustering

11.1 Training Mean Shift

set.seed(42)

samp_ms <- sample(nrow(X_final), min(800, nrow(X_final)))
X_ms    <- as.matrix(X_final[samp_ms, ])

ms_result <- ms(X_ms, h = 1.5, scaled = FALSE)

ms_labels_samp <- ms_result$cluster.label
n_kluster_ms   <- length(unique(ms_labels_samp))

cat("Mean Shift selesai!\n\n")

## Mean Shift selesai!

cat("Bandwidth (h)  :", 1.5, "\n")

## Bandwidth (h)  : 1.5

cat("Jumlah kluster :", n_kluster_ms, "\n\n")

## Jumlah kluster : 1

cat("Distribusi label (subsample):\n")

## Distribusi label (subsample):

print(table(ms_labels_samp))

## ms_labels_samp
##   1 
## 800

11.2 Assign Label ke Full Data (nearest mode)

mean_shift <- function(X, h, max_iter = 100, tol = 1e-4) {
  n <- nrow(X)
  modes <- X
  
  for (iter in 1:max_iter) {
    modes_new <- modes
    for (i in 1:n) {
      dists   <- rowSums((X - matrix(modes[i, ], n, ncol(X), byrow = TRUE))^2)
      weights <- exp(-dists / (2 * h^2))
      modes_new[i, ] <- colSums(X * weights) / sum(weights)
    }
    shift <- max(sqrt(rowSums((modes_new - modes)^2)))
    modes <- modes_new
    if (shift < tol) break
  }
  return(modes)
}

assign_clusters <- function(modes, tol = 0.5) {
  n      <- nrow(modes)
  labels <- rep(0, n)
  k      <- 0
  
  for (i in 1:n) {
    if (labels[i] == 0) {
      k <- k + 1
      labels[i] <- k
    }
    if (i < n) {
      for (j in (i+1):n) {
        if (labels[j] == 0) {
          dist_ij <- sqrt(sum((modes[i, ] - modes[j, ])^2))
          if (dist_ij < tol) labels[j] <- labels[i]
        }
      }
    }
  }
  return(labels)
}

assign_to_mode <- function(X, centers) {
  apply(X, 1, function(xi) {
    dists <- apply(centers, 1, function(m) sqrt(sum((xi - m)^2)))
    which.min(dists)
  })
}

cat("Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.\n")

## Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.

set.seed(42)
samp_ms <- sample(nrow(X_final), min(500, nrow(X_final)))
X_ms    <- as.matrix(X_final[samp_ms, ])

cat("Menjalankan Mean Shift dengan h = 0.8...\n")

## Menjalankan Mean Shift dengan h = 0.8...

final_modes    <- mean_shift(X_ms, h = 0.8, max_iter = 50, tol = 1e-3)
ms_labels_samp <- assign_clusters(final_modes, tol = 0.3)  

n_kluster_ms <- length(unique(ms_labels_samp))
cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")

## Jumlah kluster ditemukan: 3

print(table(ms_labels_samp))

## ms_labels_samp
##   1   2   3 
## 435  41  24

if (n_kluster_ms == 1) {
  cat("\nMasih 1 kluster, mencoba h = 0.5...\n")
  final_modes    <- mean_shift(X_ms, h = 0.5, max_iter = 50, tol = 1e-3)
  ms_labels_samp <- assign_clusters(final_modes, tol = 0.2)
  n_kluster_ms   <- length(unique(ms_labels_samp))
  cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")
  print(table(ms_labels_samp))
}

mode_centers <- t(sapply(unique(ms_labels_samp), function(k) {
  colMeans(X_ms[ms_labels_samp == k, , drop = FALSE])
}))

ms_labels_full <- assign_to_mode(as.matrix(X_final), mode_centers)
cat("\nDistribusi label full data:\n")

## 
## Distribusi label full data:

print(table(ms_labels_full))

## ms_labels_full
##     1     2     3 
## 10220  2102  1221

11.3 Evaluasi Internal Mean Shift

set.seed(42)
samp_ms2 <- sample(nrow(X_final), min(2000, nrow(X_final)))

n_kluster_full <- length(unique(ms_labels_full))
cat("Jumlah kluster unik di full data:", n_kluster_full, "\n")

## Jumlah kluster unik di full data: 3

if (n_kluster_full >= 2) {
  sil_ms     <- silhouette(ms_labels_full[samp_ms2], dist(X_final[samp_ms2, ]))
  avg_sil_ms <- mean(sil_ms[, 3])
  
  fviz_silhouette(sil_ms,
                  palette       = "jco",
                  ggtheme       = theme_minimal(base_size = 11),
                  print.summary = FALSE) +
    geom_hline(yintercept = avg_sil_ms, linetype = "dashed",
               color = "#e74c3c", linewidth = 0.8) +
    labs(
      title    = "Silhouette Plot — Mean Shift Clustering",
      subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_ms, 4))
    ) +
    theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
          plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
  
} else {
  avg_sil_ms <- NA
  cat("Hanya 1 kluster terbentuk.\n")
}

11.4 Visualisasi Mean Shift (PCA 2D)

pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))

warna_ms <- setNames(
  c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
    "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
  paste0("K", 1:10)
)

ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_ms) +
  labs(
    title    = "Visualisasi Mean Shift Clustering (PCA 2D)",
    subtitle = paste0("Bandwidth h = 1.5 | ",
                      n_kluster_ms, " kluster ditemukan otomatis"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.5 Visualisasi (PCA 2D)

pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))

warna_ms <- setNames(
  c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
    "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
  paste0("K", 1:10)
)

ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_ms) +
  labs(
    title    = "Visualisasi Mean Shift Clustering (PCA 2D)",
    subtitle = paste0("Bandwidth h = 1.5"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.6 Evaluasi Eksternal

purity_ms <- purity_score(y_label, ms_labels_full)
cat("Purity Score Mean Shift:", round(purity_ms, 4),
    paste0("(", round(purity_ms * 100, 2), "%)"), "\n")

## Purity Score Mean Shift: 0.3939 (39.39%)

Tahap 12: Fuzzy C-Means Clustering

##12.1 Training Fuzzy C-means

## 12.1 Training Fuzzy C-Means
library(e1071)

# Definisikan ulang jika session di-restart
k_optimal <- 7

set.seed(42)

fcm_result <- cmeans(
  as.matrix(X_final),
  centers  = k_optimal,
  iter.max = 1000,
  m        = 2,
  dist     = "euclidean",
  method   = "cmeans"
)

fcm_labels     <- fcm_result$cluster
fcm_membership <- fcm_result$membership

cat("Fuzzy C-Means selesai!\n\n")

## Fuzzy C-Means selesai!

cat("Fuzzifier (m)    : 2\n")

## Fuzzifier (m)    : 2

cat("Metrik jarak     : Euclidean\n")

## Metrik jarak     : Euclidean

cat("Jumlah kluster   :", k_optimal, "\n")

## Jumlah kluster   : 7

cat("Withiness (obj)  :", round(fcm_result$withinerror, 4), "\n\n")

## Withiness (obj)  : 0.7123

cat("Distribusi label hard:\n")

## Distribusi label hard:

print(table(fcm_labels))

## fcm_labels
##    1    2    3    4    5    6    7 
## 2702 2747 1208 1489 1824 2307 1266

12.2 Visualisasi Matriks Keanggotaan Fuzzy (200 sampel)

library(tidyverse)

set.seed(42)
samp_fcm <- sample(nrow(fcm_membership), 200)

membership_long <- as.data.frame(fcm_membership[samp_fcm, ]) %>%
  mutate(Observasi = row_number()) %>%
  pivot_longer(-Observasi,
               names_to  = "Kluster",
               values_to = "Keanggotaan") %>%
  mutate(Kluster = paste0("K", Kluster))

ggplot(membership_long,
       aes(x = Observasi, y = Keanggotaan, fill = Kluster)) +
  geom_area(alpha = 0.8, position = "stack") +
  scale_fill_manual(values = c("#2ecc71","#3498db","#e74c3c",
                               "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  labs(
    title    = "Matriks Keanggotaan Fuzzy C-Means (200 sampel)",
    subtitle = "Setiap observasi memiliki derajat keanggotaan di semua kluster",
    x = "Observasi (sampel)", y = "Derajat Keanggotaan",
    fill = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.3 Evaluasi Internal — Silhouette

set.seed(42)
samp_fcm2   <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_fcm     <- silhouette(fcm_labels[samp_fcm2], dist(X_final[samp_fcm2, ]))
avg_sil_fcm <- mean(sil_fcm[, 3])

fviz_silhouette(sil_fcm,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_fcm, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  labs(
    title    = "Silhouette Plot — Fuzzy C-Means Clustering",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_fcm, 4),
                      " | m = 2 (fuzzifier)")
  ) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.4 Visualisasi Cluster Plot (PCA 2D)

pca_2d_fcm <- as.data.frame(
  prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2]
)
pca_2d_fcm$Kluster <- factor(paste0("K", fcm_labels))

ggplot(pca_2d_fcm, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = c("#2ecc71","#3498db","#e74c3c",
                                "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  stat_ellipse(type = "convex", alpha = 0.12, geom = "polygon",
               aes(fill = Kluster), show.legend = FALSE) +
  scale_fill_manual(values  = c("#2ecc71","#3498db","#e74c3c",
                                "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  labs(
    title    = "Visualisasi Fuzzy C-Means Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal, " | m = 2 | Label = hard assignment"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type

12.5 Heatmap Centroid Fuzzy C-Means

centroid_fcm <- as.data.frame(fcm_result$centers) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(centroid_fcm, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#3498db",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Centroid Fuzzy C-Means (Z-score)",
    subtitle = "Centroid fuzzy = rata-rata berbobot keanggotaan | Merah = rendah | Biru = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

12.6 Evaluasi Eksternal — Purity Score

purity_score <- function(labels_true, labels_pred) {
  tbl <- table(labels_true, labels_pred)
  sum(apply(tbl, 2, max)) / length(labels_true)
}

purity_fcm <- purity_score(y_label, fcm_labels)

table(Kluster = fcm_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")

Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli
Kluster	BARBUNYA	BOMBAY	CALI	DERMASON	HOROZ	SEKER	SIRA
K1	35	0	13	1505	16	77	1056
K2	41	0	19	1482	9	114	1082
K3	798	0	70	46	81	18	195
K4	203	147	701	48	278	5	107
K5	184	369	756	0	434	0	81
K6	3	4	2	451	0	1813	34
K7	58	2	69	14	1042	0	81

cat("\nPurity Score Fuzzy C-Means:", round(purity_fcm, 4),
    paste0("(", round(purity_fcm * 100, 2), "%)"), "\n")

## 
## Purity Score Fuzzy C-Means: 0.5979 (59.79%)

12.7 Kesimpulan Fuzzy C-Means

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Fuzzifier (m)", "Jumlah Kluster",
             "Silhouette Score", "Purity Score", "Status"),
  Hasil  = c("Fuzzy C-Means (cmeans)",
             "Centroid fuzzy (rata-rata berbobot keanggotaan)",
             "Euclidean",
             "2 (standar)",
             paste0(k_optimal, " kluster"),
             round(avg_sil_fcm, 4),
             round(purity_fcm, 4),
             "Selesai — lanjut ke Tahap 13 Perbandingan")
) %>%
  kable(caption = "Kesimpulan Tahap 12: Fuzzy C-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#eafaf1", color = "#1e8449")

Kesimpulan Tahap 12: Fuzzy C-Means Clustering
Aspek	Hasil
Algoritma	Fuzzy C-Means (cmeans)
Pusat Kluster	Centroid fuzzy (rata-rata berbobot keanggotaan)
Metrik Jarak	Euclidean
Fuzzifier (m)	2 (standar)
Jumlah Kluster	7 kluster
Silhouette Score	0.253
Purity Score	0.5979
Status	Selesai — lanjut ke Tahap 13 Perbandingan

MODUL 3 ANALISIS MULTIVARIAT

Fitri Saidatur Hanifah (035) Dan Darista Wardhani (146)

2026-04-05

Tahap 1: Load Data & Inspeksi Awal

1.1 Load Dataset

1.2 Preview Data

1.3 Tipe Data Tiap Kolom

1.4 Statistik Deskriptif

1.5 Jumlah Nilai Unik Tiap Kolom

1.6 Distribusi Kelas

Tahap 2: Cek & Tangani Missing Values

2.1 Cek Missing Values

2.2 Cek Data Duplikat

2.3 Kesimpulan Kualitas Data

3.2 Deteksi Outlier: Metode IQR (Boxplot Rule)

3.3 Deteksi Outlier: Metode Z-Score

3.4 Ringkasan Level Baris

3.5 Visualisasi Boxplot per Fitur

3.6 Visualisasi: Bar Chart % Outlier per Fitur

3.7 Penanganan Outlier: Winsorizing (Capping IQR)

3.8 Kesimpulan Tahap 3

Tahap 4: Exploratory Data Analysis (EDA)

4.1 Setup: Siapkan Data Numerik Bersih

4.2 Statistik Deskriptif (Setelah Winsorizing)

4.3 Distribusi Tiap Fitur (Histogram)

4.4 Distribusi Fitur per Kelas

4.5 Matriks Korelasi Antar Fitur

4.6 Visualisasi Heatmap Korelasi

4.7 Identifikasi Pasangan Fitur Berkorelasi Tinggi

4.8 Kesimpulan EDA

Tahap 5: Feature Selection / Reduksi Dimensi (Opsional)

5.1 Strategi: Drop Fitur Redundan

5.2 PCA: Analisis Komponen Utama

5.3 Scree Plot PCA

5.4 Biplot PCA

5.5 Keputusan: Pilih Pendekatan

Tahap 6: Pisahkan Kolom Label (Class)

6.1 Pisahkan Fitur dan Label

Tahap 7: Normalisasi / Standarisasi Fitur

7.1 Alasan Normalisasi

7.2 Standardisasi Z-score (StandardScaler)

7.3 Min-Max Normalization (MinMaxScaler)

7.4 Perbandingan Distribusi: Sebelum vs Sesudah Normalisasi

7.5 Pilih Metode Normalisasi untuk Clustering

8.1 Penentuan Jumlah Kluster Optimal

8.1.1 Metode Elbow (WSS)

8.1.2 Metode Silhouette

8.1.3 Keputusan k

8.2 Training K-Means

8.3 Evaluasi Internal K-Means

8.3.1 Silhouette Plot

8.3.2 Tabel Metrik Internal

8.4 Visualisasi Hasil K-Means

8.4.1 Cluster Plot (PCA 2D)

8.4.2 Distribusi Anggota per Kluster

8.4.3 Heatmap Profil Centroid

8.5 Evaluasi Eksternal K-Means

8.6 Kesimpulan K-Means

Tahap 9: K-Median Clustering

9.1 Penentuan Jumlah Kluster Optimal

9.1.1 Metode Elbow (Total Dissimilarity PAM)

9.1.2 Metode Silhouette

9.2 Training K-Median (PAM)

9.3 Evaluasi Internal K-Median

9.3.1 Silhouette Plot

9.3.2 Tabel Metrik Internal

9.4 Visualisasi Hasil K-Median

9.4.1 Cluster Plot (PCA 2D)

9.4.2 Distribusi Anggota per Kluster

9.4.3 Heatmap Profil Medoid

9.5 Evaluasi Eksternal K-Median

9.6 Kesimpulan K-Median

Tahap 10: DBSCAN Clustering

10.1 Penentuan Parameter eps (k-distance plot)

10.2 Training DBSCAN

10.3 Evaluasi Internal DBSCAN

10.4 Visualisasi DBSCAN (PCA 2D)

10.5 Evaluasi Eksternal DBSCAN

Tahap 11: Mean Shift Clustering

11.1 Training Mean Shift