Tahap 1: Load Data & Inspeksi Awal
1.1 Load Dataset
df <- read_excel("Dry_Bean_Dataset.xlsx")
cat("Dataset berhasil dimuat\n")
## Dataset berhasil dimuat
cat("Jumlah baris :", nrow(df), "\n")
## Jumlah baris : 13611
cat("Jumlah kolom :", ncol(df), "\n")
## Jumlah kolom : 17
cat("Total sel :", nrow(df) * ncol(df), "\n")
## Total sel : 231387
1.2 Preview Data
df %>%
head(10) %>%
kable(caption = "10 Baris Pertama Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12)
10 Baris Pertama Dataset
|
Area
|
Perimeter
|
MajorAxisLength
|
MinorAxisLength
|
AspectRation
|
Eccentricity
|
ConvexArea
|
EquivDiameter
|
Extent
|
Solidity
|
roundness
|
Compactness
|
ShapeFactor1
|
ShapeFactor2
|
ShapeFactor3
|
ShapeFactor4
|
Class
|
|
28395
|
610.291
|
208.1781
|
173.8887
|
1.197191
|
0.5498122
|
28715
|
190.1411
|
0.7639225
|
0.9888560
|
0.9580271
|
0.9133578
|
0.0073315
|
0.0031473
|
0.8342224
|
0.9987239
|
SEKER
|
|
28734
|
638.018
|
200.5248
|
182.7344
|
1.097357
|
0.4117853
|
29172
|
191.2728
|
0.7839681
|
0.9849856
|
0.8870336
|
0.9538608
|
0.0069787
|
0.0035636
|
0.9098505
|
0.9984303
|
SEKER
|
|
29380
|
624.110
|
212.8261
|
175.9311
|
1.209713
|
0.5627273
|
29690
|
193.4109
|
0.7781132
|
0.9895588
|
0.9478495
|
0.9087742
|
0.0072439
|
0.0030477
|
0.8258706
|
0.9990661
|
SEKER
|
|
30008
|
645.884
|
210.5580
|
182.5165
|
1.153638
|
0.4986160
|
30724
|
195.4671
|
0.7826813
|
0.9766957
|
0.9039364
|
0.9283288
|
0.0070167
|
0.0032146
|
0.8617944
|
0.9941988
|
SEKER
|
|
30140
|
620.134
|
201.8479
|
190.2793
|
1.060798
|
0.3336797
|
30417
|
195.8965
|
0.7730980
|
0.9908933
|
0.9848771
|
0.9705155
|
0.0066970
|
0.0036650
|
0.9419004
|
0.9991661
|
SEKER
|
|
30279
|
634.927
|
212.5606
|
181.5102
|
1.171067
|
0.5204007
|
30600
|
196.3477
|
0.7756885
|
0.9895098
|
0.9438518
|
0.9237260
|
0.0070201
|
0.0031528
|
0.8532696
|
0.9992358
|
SEKER
|
|
30477
|
670.033
|
211.0502
|
184.0391
|
1.146768
|
0.4894779
|
30970
|
196.9886
|
0.7624015
|
0.9840814
|
0.8530799
|
0.9333736
|
0.0069249
|
0.0032420
|
0.8711862
|
0.9990487
|
SEKER
|
|
30519
|
629.727
|
212.9968
|
182.7372
|
1.165590
|
0.5137596
|
30847
|
197.1243
|
0.7706818
|
0.9893669
|
0.9671092
|
0.9254804
|
0.0069792
|
0.0031583
|
0.8565140
|
0.9983446
|
SEKER
|
|
30685
|
635.681
|
213.5341
|
183.1571
|
1.165852
|
0.5140809
|
31044
|
197.6597
|
0.7715615
|
0.9884358
|
0.9542398
|
0.9256585
|
0.0069589
|
0.0031515
|
0.8568437
|
0.9989530
|
SEKER
|
|
30834
|
631.934
|
217.2278
|
180.8975
|
1.200834
|
0.5536422
|
31120
|
198.1390
|
0.7836828
|
0.9908098
|
0.9702782
|
0.9121254
|
0.0070451
|
0.0030080
|
0.8319728
|
0.9990611
|
SEKER
|
1.3 Tipe Data Tiap Kolom
tipe_data <- data.frame(
Kolom = names(df),
Tipe = sapply(df, class),
row.names = NULL
)
tipe_data %>%
kable(caption = "Tipe Data Tiap Kolom") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
Tipe Data Tiap Kolom
|
Kolom
|
Tipe
|
|
Area
|
numeric
|
|
Perimeter
|
numeric
|
|
MajorAxisLength
|
numeric
|
|
MinorAxisLength
|
numeric
|
|
AspectRation
|
numeric
|
|
Eccentricity
|
numeric
|
|
ConvexArea
|
numeric
|
|
EquivDiameter
|
numeric
|
|
Extent
|
numeric
|
|
Solidity
|
numeric
|
|
roundness
|
numeric
|
|
Compactness
|
numeric
|
|
ShapeFactor1
|
numeric
|
|
ShapeFactor2
|
numeric
|
|
ShapeFactor3
|
numeric
|
|
ShapeFactor4
|
numeric
|
|
Class
|
character
|
1.4 Statistik Deskriptif
df %>%
select(-Class) %>%
summary() %>%
kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
font_size = 11)
Statistik Deskriptif Fitur Numerik
|
|
Area
|
Perimeter
|
MajorAxisLength
|
MinorAxisLength
|
AspectRation
|
Eccentricity
|
ConvexArea
|
EquivDiameter
|
Extent
|
Solidity
|
roundness
|
Compactness
|
ShapeFactor1
|
ShapeFactor2
|
ShapeFactor3
|
ShapeFactor4
|
|
|
Min. : 20420
|
Min. : 524.7
|
Min. :183.6
|
Min. :122.5
|
Min. :1.025
|
Min. :0.2190
|
Min. : 20684
|
Min. :161.2
|
Min. :0.5553
|
Min. :0.9192
|
Min. :0.4896
|
Min. :0.6406
|
Min. :0.002778
|
Min. :0.0005642
|
Min. :0.4103
|
Min. :0.9477
|
|
|
1st Qu.: 36328
|
1st Qu.: 703.5
|
1st Qu.:253.3
|
1st Qu.:175.8
|
1st Qu.:1.432
|
1st Qu.:0.7159
|
1st Qu.: 36715
|
1st Qu.:215.1
|
1st Qu.:0.7186
|
1st Qu.:0.9857
|
1st Qu.:0.8321
|
1st Qu.:0.7625
|
1st Qu.:0.005900
|
1st Qu.:0.0011535
|
1st Qu.:0.5814
|
1st Qu.:0.9937
|
|
|
Median : 44652
|
Median : 794.9
|
Median :296.9
|
Median :192.4
|
Median :1.551
|
Median :0.7644
|
Median : 45178
|
Median :238.4
|
Median :0.7599
|
Median :0.9883
|
Median :0.8832
|
Median :0.8013
|
Median :0.006645
|
Median :0.0016935
|
Median :0.6420
|
Median :0.9964
|
|
|
Mean : 53048
|
Mean : 855.3
|
Mean :320.1
|
Mean :202.3
|
Mean :1.583
|
Mean :0.7509
|
Mean : 53768
|
Mean :253.1
|
Mean :0.7497
|
Mean :0.9871
|
Mean :0.8733
|
Mean :0.7999
|
Mean :0.006564
|
Mean :0.0017159
|
Mean :0.6436
|
Mean :0.9951
|
|
|
3rd Qu.: 61332
|
3rd Qu.: 977.2
|
3rd Qu.:376.5
|
3rd Qu.:217.0
|
3rd Qu.:1.707
|
3rd Qu.:0.8105
|
3rd Qu.: 62294
|
3rd Qu.:279.4
|
3rd Qu.:0.7869
|
3rd Qu.:0.9900
|
3rd Qu.:0.9169
|
3rd Qu.:0.8343
|
3rd Qu.:0.007271
|
3rd Qu.:0.0021703
|
3rd Qu.:0.6960
|
3rd Qu.:0.9979
|
|
|
Max. :254616
|
Max. :1985.4
|
Max. :738.9
|
Max. :460.2
|
Max. :2.430
|
Max. :0.9114
|
Max. :263261
|
Max. :569.4
|
Max. :0.8662
|
Max. :0.9947
|
Max. :0.9907
|
Max. :0.9873
|
Max. :0.010451
|
Max. :0.0036650
|
Max. :0.9748
|
Max. :0.9997
|
skim(df)
Data summary
| Name |
df |
| Number of rows |
13611 |
| Number of columns |
17 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
16 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: numeric
| Area |
0 |
1 |
53048.28 |
29324.10 |
20420.00 |
36328.00 |
44652.00 |
61332.00 |
254616.00 |
▇▂▁▁▁ |
| Perimeter |
0 |
1 |
855.28 |
214.29 |
524.74 |
703.52 |
794.94 |
977.21 |
1985.37 |
▇▆▁▁▁ |
| MajorAxisLength |
0 |
1 |
320.14 |
85.69 |
183.60 |
253.30 |
296.88 |
376.50 |
738.86 |
▇▆▂▁▁ |
| MinorAxisLength |
0 |
1 |
202.27 |
44.97 |
122.51 |
175.85 |
192.43 |
217.03 |
460.20 |
▇▇▁▁▁ |
| AspectRation |
0 |
1 |
1.58 |
0.25 |
1.02 |
1.43 |
1.55 |
1.71 |
2.43 |
▂▇▅▂▁ |
| Eccentricity |
0 |
1 |
0.75 |
0.09 |
0.22 |
0.72 |
0.76 |
0.81 |
0.91 |
▁▁▂▇▇ |
| ConvexArea |
0 |
1 |
53768.20 |
29774.92 |
20684.00 |
36714.50 |
45178.00 |
62294.00 |
263261.00 |
▇▂▁▁▁ |
| EquivDiameter |
0 |
1 |
253.06 |
59.18 |
161.24 |
215.07 |
238.44 |
279.45 |
569.37 |
▇▆▁▁▁ |
| Extent |
0 |
1 |
0.75 |
0.05 |
0.56 |
0.72 |
0.76 |
0.79 |
0.87 |
▁▁▅▇▂ |
| Solidity |
0 |
1 |
0.99 |
0.00 |
0.92 |
0.99 |
0.99 |
0.99 |
0.99 |
▁▁▁▁▇ |
| roundness |
0 |
1 |
0.87 |
0.06 |
0.49 |
0.83 |
0.88 |
0.92 |
0.99 |
▁▁▂▇▇ |
| Compactness |
0 |
1 |
0.80 |
0.06 |
0.64 |
0.76 |
0.80 |
0.83 |
0.99 |
▂▅▇▂▁ |
| ShapeFactor1 |
0 |
1 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.01 |
0.01 |
▁▃▇▃▁ |
| ShapeFactor2 |
0 |
1 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▇▇▇▃▁ |
| ShapeFactor3 |
0 |
1 |
0.64 |
0.10 |
0.41 |
0.58 |
0.64 |
0.70 |
0.97 |
▂▇▇▃▁ |
| ShapeFactor4 |
0 |
1 |
1.00 |
0.00 |
0.95 |
0.99 |
1.00 |
1.00 |
1.00 |
▁▁▁▁▇ |
1.5 Jumlah Nilai Unik Tiap Kolom
nilai_unik <- data.frame(
Kolom = names(df),
Nilai_Unik = sapply(df, n_distinct),
row.names = NULL
)
nilai_unik %>%
kable(caption = "Jumlah Nilai Unik per Kolom") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
Jumlah Nilai Unik per Kolom
|
Kolom
|
Nilai_Unik
|
|
Area
|
12011
|
|
Perimeter
|
13416
|
|
MajorAxisLength
|
13543
|
|
MinorAxisLength
|
13543
|
|
AspectRation
|
13543
|
|
Eccentricity
|
13543
|
|
ConvexArea
|
12066
|
|
EquivDiameter
|
12011
|
|
Extent
|
13535
|
|
Solidity
|
13526
|
|
roundness
|
13543
|
|
Compactness
|
13543
|
|
ShapeFactor1
|
13543
|
|
ShapeFactor2
|
13543
|
|
ShapeFactor3
|
13543
|
|
ShapeFactor4
|
13543
|
|
Class
|
7
|
1.6 Distribusi Kelas
dist_kelas <- df %>%
count(Class, name = "Jumlah") %>%
arrange(desc(Jumlah)) %>%
mutate(Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1))
dist_kelas %>%
kable(caption = "Distribusi Data per Kelas") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Distribusi Data per Kelas
|
Class
|
Jumlah
|
Persentase
|
|
DERMASON
|
3546
|
26.1%
|
|
SIRA
|
2636
|
19.4%
|
|
SEKER
|
2027
|
14.9%
|
|
HOROZ
|
1928
|
14.2%
|
|
CALI
|
1630
|
12.0%
|
|
BARBUNYA
|
1322
|
9.7%
|
|
BOMBAY
|
522
|
3.8%
|
warna <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6","#1abc9c","#e67e22")
# Bar chart
p1 <- ggplot(dist_kelas, aes(x = reorder(Class, -Jumlah),
y = Jumlah, fill = Class)) +
geom_col(width = 0.7, show.legend = FALSE) +
geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
vjust = -0.3, size = 3.5, fontface = "bold") +
scale_fill_manual(values = warna) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Distribusi Jumlah Data per Kelas",
x = "Kelas", y = "Jumlah") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_text(face = "bold"))
# Pie chart
p2 <- ggplot(dist_kelas, aes(x = "", y = Jumlah, fill = Class)) +
geom_col(width = 1, color = "white", linewidth = 0.5) +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(Class, "\n", Persentase)),
position = position_stack(vjust = 0.5),
size = 3.5, fontface = "bold", color = "white") +
scale_fill_manual(values = warna) +
labs(title = "Proporsi Kelas") +
theme_void(base_size = 13) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "none")
gridExtra::grid.arrange(p1, p2, ncol = 2)

Tahap 2: Cek & Tangani Missing Values
2.1 Cek Missing Values
missing_df <- data.frame(
Kolom = names(df),
Missing_Count = sapply(df, function(x) sum(is.na(x))),
Missing_Persen = sapply(df, function(x)
percent(mean(is.na(x)), accuracy = 0.01)),
row.names = NULL
)
missing_df %>%
kable(caption = "Ringkasan Missing Values per Kolom") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(missing_df$Missing_Count > 0),
background = "#fde8e8", color = "#c0392b")
Ringkasan Missing Values per Kolom
|
Kolom
|
Missing_Count
|
Missing_Persen
|
|
Area
|
0
|
0.00%
|
|
Perimeter
|
0
|
0.00%
|
|
MajorAxisLength
|
0
|
0.00%
|
|
MinorAxisLength
|
0
|
0.00%
|
|
AspectRation
|
0
|
0.00%
|
|
Eccentricity
|
0
|
0.00%
|
|
ConvexArea
|
0
|
0.00%
|
|
EquivDiameter
|
0
|
0.00%
|
|
Extent
|
0
|
0.00%
|
|
Solidity
|
0
|
0.00%
|
|
roundness
|
0
|
0.00%
|
|
Compactness
|
0
|
0.00%
|
|
ShapeFactor1
|
0
|
0.00%
|
|
ShapeFactor2
|
0
|
0.00%
|
|
ShapeFactor3
|
0
|
0.00%
|
|
ShapeFactor4
|
0
|
0.00%
|
|
Class
|
0
|
0.00%
|
cat("\nTotal missing values:", sum(is.na(df)), "\n")
##
## Total missing values: 0
2.2 Cek Data Duplikat
n_duplikat <- sum(duplicated(df))
cat("Jumlah baris duplikat:", n_duplikat, "\n")
## Jumlah baris duplikat: 68
if (n_duplikat > 0) {
df <- df %>% distinct()
cat("Duplikat dihapus. Shape baru:", nrow(df), "x", ncol(df), "\n")
} else {
cat("Tidak ada duplikat\n")
}
## Duplikat dihapus. Shape baru: 13543 x 17
2.3 Kesimpulan Kualitas Data
kesimpulan <- data.frame(
Aspek = c("Shape awal", "Total missing values",
"Data duplikat", "Jumlah kelas", "Fitur numerik"),
Hasil = c(
paste0(nrow(df), " baris × ", ncol(df), " kolom"),
paste0(sum(is.na(df)), " (tidak ada)"),
paste0(n_duplikat, " (tidak ada)"),
paste0(n_distinct(df$Class), " kelas"),
paste0(ncol(select(df, -Class)), " kolom")
),
Status = c("oke", "oke", "oke", "oke", "oke")
)
kesimpulan %>%
kable(caption = "Kesimpulan Kualitas Data") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
column_spec(3, color = "green", bold = TRUE)
Kesimpulan Kualitas Data
|
Aspek
|
Hasil
|
Status
|
|
Shape awal
|
13543 baris × 17 kolom
|
oke
|
|
Total missing values
|
0 (tidak ada)
|
oke
|
|
Data duplikat
|
68 (tidak ada)
|
oke
|
|
Jumlah kelas
|
7 kelas
|
oke
|
|
Fitur numerik
|
16 kolom
|
oke
|
#3. Deteksi & Tangani Outlier ##3.1 Setup: Pisahkan Fitur
Numerik
df_numerik <- df %>% select(-Class)
fitur_numerik <- names(df_numerik)
cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")
## Jumlah fitur numerik: 16
cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")
## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4
3.2 Deteksi Outlier: Metode IQR (Boxplot Rule)
iqr_summary <- df_numerik %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
Q1 = quantile(Nilai, 0.25),
Q3 = quantile(Nilai, 0.75),
IQR = Q3 - Q1,
Lower_Bound = Q1 - 1.5 * IQR,
Upper_Bound = Q3 + 1.5 * IQR,
N_Outlier = sum(Nilai < (Q1 - 1.5 * IQR) | Nilai > (Q3 + 1.5 * IQR)),
Pct_Outlier = round(N_Outlier / n() * 100, 2),
.groups = "drop"
) %>%
arrange(desc(N_Outlier))
iqr_summary %>%
kable(
caption = "Deteksi Outlier per Fitur (IQR)",
digits = 4,
col.names = c("Fitur", "Q1", "Q3", "IQR",
"Batas Bawah", "Batas Atas",
"N Outlier", "% Outlier")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(
which(iqr_summary$Pct_Outlier >= 5),
background = "#fde8e8", color = "#c0392b"
)
Deteksi Outlier per Fitur (IQR)
|
Fitur
|
Q1
|
Q3
|
IQR
|
Batas Bawah
|
Batas Atas
|
N Outlier
|
% Outlier
|
|
Eccentricity
|
0.7151
|
0.8097
|
0.0945
|
0.5734
|
0.9515
|
833
|
6.15
|
|
Solidity
|
0.9857
|
0.9900
|
0.0043
|
0.9792
|
0.9965
|
774
|
5.72
|
|
ShapeFactor4
|
0.9937
|
0.9979
|
0.0042
|
0.9875
|
1.0041
|
760
|
5.61
|
|
MinorAxisLength
|
175.8864
|
217.2454
|
41.3590
|
113.8478
|
279.2840
|
567
|
4.19
|
|
Area
|
36282.5000
|
61382.0000
|
25099.5000
|
-1366.7500
|
99031.2500
|
551
|
4.07
|
|
ConvexArea
|
36673.0000
|
62360.0000
|
25687.0000
|
-1857.5000
|
100890.5000
|
549
|
4.05
|
|
ShapeFactor1
|
0.0059
|
0.0073
|
0.0014
|
0.0038
|
0.0093
|
533
|
3.94
|
|
EquivDiameter
|
214.9333
|
279.5604
|
64.6271
|
117.9927
|
376.5010
|
526
|
3.88
|
|
Perimeter
|
703.2300
|
977.1465
|
273.9165
|
292.3553
|
1388.0212
|
500
|
3.69
|
|
AspectRation
|
1.4307
|
1.7039
|
0.2733
|
1.0208
|
2.1138
|
485
|
3.58
|
|
MajorAxisLength
|
253.0868
|
376.3125
|
123.2257
|
68.2483
|
561.1510
|
379
|
2.80
|
|
Extent
|
0.7187
|
0.7868
|
0.0681
|
0.6166
|
0.8890
|
271
|
2.00
|
|
ShapeFactor3
|
0.5825
|
0.6963
|
0.1138
|
0.4118
|
0.8671
|
202
|
1.49
|
|
Compactness
|
0.7632
|
0.8345
|
0.0712
|
0.6564
|
0.9413
|
124
|
0.92
|
|
roundness
|
0.8334
|
0.9170
|
0.0836
|
0.7080
|
1.0425
|
98
|
0.72
|
|
ShapeFactor2
|
0.0012
|
0.0022
|
0.0010
|
-0.0004
|
0.0037
|
0
|
0.00
|
3.3 Deteksi Outlier: Metode Z-Score
zscore_summary <- df_numerik %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
N_Outlier_Z = sum(abs(scale(Nilai)) > 3),
Pct_Outlier_Z = round(N_Outlier_Z / n() * 100, 2),
.groups = "drop"
) %>%
arrange(desc(N_Outlier_Z))
zscore_summary %>%
kable(
caption = "Deteksi Outlier per Fitur (Z-score)",
col.names = c("Fitur", "N Outlier", "% Outlier")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Deteksi Outlier per Fitur (Z-score)
|
Fitur
|
N Outlier
|
% Outlier
|
|
MinorAxisLength
|
506
|
3.74
|
|
Area
|
483
|
3.57
|
|
ConvexArea
|
482
|
3.56
|
|
EquivDiameter
|
465
|
3.43
|
|
Perimeter
|
404
|
2.98
|
|
MajorAxisLength
|
316
|
2.33
|
|
ShapeFactor4
|
239
|
1.76
|
|
Solidity
|
237
|
1.75
|
|
Extent
|
139
|
1.03
|
|
Eccentricity
|
125
|
0.92
|
|
roundness
|
75
|
0.55
|
|
ShapeFactor1
|
53
|
0.39
|
|
AspectRation
|
21
|
0.16
|
|
ShapeFactor3
|
8
|
0.06
|
|
ShapeFactor2
|
5
|
0.04
|
|
Compactness
|
1
|
0.01
|
3.4 Ringkasan Level Baris
outlier_mask <- df_numerik
for (col in fitur_numerik) {
q1 <- quantile(df_numerik[[col]], 0.25)
q3 <- quantile(df_numerik[[col]], 0.75)
iqr <- q3 - q1
outlier_mask[[col]] <- df_numerik[[col]] < (q1 - 1.5 * iqr) |
df_numerik[[col]] > (q3 + 1.5 * iqr)
}
baris_outlier <- rowSums(outlier_mask) > 0
n_baris_outlier <- sum(baris_outlier)
n_baris_bersih <- nrow(df) - n_baris_outlier
ringkasan_baris <- data.frame(
Kondisi = c("Baris dengan outlier (≥1 fitur)", "Baris bersih", "Total"),
Jumlah = c(n_baris_outlier, n_baris_bersih, nrow(df)),
Persentase = percent(
c(n_baris_outlier, n_baris_bersih, nrow(df)) / nrow(df),
accuracy = 0.01
)
)
ringkasan_baris %>%
kable(caption = "Ringkasan Outlier Level Baris (IQR)") %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(1, background = "#fde8e8", color = "#c0392b") %>%
row_spec(2, background = "#eafaf1", color = "#1e8449")
Ringkasan Outlier Level Baris (IQR)
|
Kondisi
|
Jumlah
|
Persentase
|
|
Baris dengan outlier (≥1 fitur)
|
3004
|
22.18%
|
|
Baris bersih
|
10539
|
77.82%
|
|
Total
|
13543
|
100.00%
|
3.5 Visualisasi Boxplot per Fitur
warna_box <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
"#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
"#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")
df_long <- df_numerik %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
mutate(Fitur = factor(Fitur, levels = fitur_numerik))
ggplot(df_long, aes(x = Fitur, y = Nilai, fill = Fitur)) +
geom_boxplot(
outlier.shape = 16,
outlier.size = 0.8,
outlier.alpha = 0.4,
outlier.colour = "#e74c3c",
show.legend = FALSE
) +
scale_fill_manual(values = warna_box) +
facet_wrap(~Fitur, scales = "free", ncol = 4) +
labs(
title = "Boxplot Tiap Fitur",
subtitle = "Skala bebas per fitur (free scale)",
x = NULL, y = "Nilai"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(face = "bold", size = 10)
)

3.6 Visualisasi: Bar Chart % Outlier per Fitur
iqr_plot <- iqr_summary %>%
mutate(
Fitur = reorder(Fitur, Pct_Outlier),
Warna = ifelse(Pct_Outlier >= 5, "#e74c3c", "#3498db")
)
ggplot(iqr_plot, aes(x = Fitur, y = Pct_Outlier, fill = Warna)) +
geom_col(width = 0.7, show.legend = FALSE) +
geom_hline(yintercept = 5, linetype = "dashed",
color = "#f39c12", linewidth = 1) +
geom_text(aes(label = paste0(Pct_Outlier, "%")),
hjust = -0.15, size = 3.5, fontface = "bold") +
scale_fill_identity() +
scale_y_continuous(
expand = expansion(mult = c(0, 0.2)),
labels = function(x) paste0(x, "%")
) +
coord_flip() +
annotate("text", x = 1, y = 5.3,
label = "Batas 5%", color = "#f39c12",
size = 3.5, fontface = "italic") +
labs(
title = "Persentase Outlier per Fitur (Metode IQR)",
subtitle = "Merah = outlier ≥ 5% | Biru = outlier < 5%",
x = "Fitur", y = "% Outlier"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
panel.grid.major.y = element_blank()
)

3.7 Penanganan Outlier: Winsorizing (Capping IQR)
df_clean <- df
for (col in fitur_numerik) {
q1 <- quantile(df_clean[[col]], 0.25)
q3 <- quantile(df_clean[[col]], 0.75)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
df_clean[[col]] <- pmin(pmax(df_clean[[col]], lower), upper)
}
cat("Winsorizing selesai!\n")
## Winsorizing selesai!
cat("Shape df_clean:", nrow(df_clean), "x", ncol(df_clean), "\n")
## Shape df_clean: 13543 x 17
cat("Kolom Class tetap ada:", "Class" %in% names(df_clean), "\n")
## Kolom Class tetap ada: TRUE
outlier_setelah <- df_clean %>%
select(-Class) %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
N_Outlier = sum(Nilai < (quantile(Nilai, 0.25) - 1.5 * IQR(Nilai)) |
Nilai > (quantile(Nilai, 0.75) + 1.5 * IQR(Nilai))),
Pct_Outlier = round(N_Outlier / n() * 100, 2),
.groups = "drop"
)
outlier_setelah %>%
kable(
caption = "Verifikasi Outlier Setelah Winsorizing",
col.names = c("Fitur", "N Outlier", "% Outlier")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
font_size = 12
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(
which(outlier_setelah$N_Outlier == 0),
background = "#eafaf1", color = "#1e8449"
)
Verifikasi Outlier Setelah Winsorizing
|
Fitur
|
N Outlier
|
% Outlier
|
|
Area
|
0
|
0
|
|
AspectRation
|
0
|
0
|
|
Compactness
|
0
|
0
|
|
ConvexArea
|
0
|
0
|
|
Eccentricity
|
0
|
0
|
|
EquivDiameter
|
0
|
0
|
|
Extent
|
0
|
0
|
|
MajorAxisLength
|
0
|
0
|
|
MinorAxisLength
|
0
|
0
|
|
Perimeter
|
0
|
0
|
|
ShapeFactor1
|
0
|
0
|
|
ShapeFactor2
|
0
|
0
|
|
ShapeFactor3
|
0
|
0
|
|
ShapeFactor4
|
0
|
0
|
|
Solidity
|
0
|
0
|
|
roundness
|
0
|
0
|
3.8 Kesimpulan Tahap 3
fitur_tinggi <- iqr_summary %>%
filter(Pct_Outlier >= 5) %>%
pull(Fitur)
kesimpulan_3 <- data.frame(
Aspek = c(
"Metode deteksi",
"Fitur dengan outlier tertinggi",
"Baris terdeteksi outlier (IQR)",
"Metode penanganan",
"Shape data setelah penanganan",
"Status"
),
Hasil = c(
"IQR (1.5×IQR) & Z-Score (|z| > 3)",
paste(fitur_tinggi, collapse = ", "),
paste0(n_baris_outlier, " baris (", round(n_baris_outlier/nrow(df)*100,2), "%)"),
"Winsorizing / Capping ke batas IQR",
paste0(nrow(df_clean), " baris × ", ncol(df_clean), " kolom"),
"Selesai — lanjut ke Tahap 4 EDA"
),
Status = c("info", "perlu cek", "perlu cek", "wajib", "oke", "oke")
)
kesimpulan_3 %>%
kable(caption = "Kesimpulan Tahap 3: Deteksi & Penanganan Outlier") %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(kesimpulan_3$Status == "oke"),
background = "#eafaf1", color = "#1e8449") %>%
row_spec(which(kesimpulan_3$Status == "perlu cek"),
background = "#fef9e7", color = "#b7770d") %>%
column_spec(3, bold = TRUE)
Kesimpulan Tahap 3: Deteksi & Penanganan Outlier
|
Aspek
|
Hasil
|
Status
|
|
Metode deteksi
|
IQR (1.5×IQR) & Z-Score (|z| > 3)
|
info
|
|
Fitur dengan outlier tertinggi
|
Eccentricity, Solidity, ShapeFactor4
|
perlu cek
|
|
Baris terdeteksi outlier (IQR)
|
3004 baris (22.18%)
|
perlu cek
|
|
Metode penanganan
|
Winsorizing / Capping ke batas IQR
|
wajib
|
|
Shape data setelah penanganan
|
13543 baris × 17 kolom
|
oke
|
|
Status
|
Selesai — lanjut ke Tahap 4 EDA
|
oke
|
Tahap 4: Exploratory Data Analysis (EDA)
4.1 Setup: Siapkan Data Numerik Bersih
df_numerik_clean <- df_clean %>% select(-Class)
fitur_numerik <- names(df_numerik_clean)
cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")
## Jumlah fitur numerik: 16
cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")
## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4
4.2 Statistik Deskriptif (Setelah Winsorizing)
stat_desc <- df_numerik_clean %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
Min = round(min(Nilai), 2),
Q1 = round(quantile(Nilai, 0.25), 2),
Median = round(median(Nilai), 2),
Mean = round(mean(Nilai), 2),
Q3 = round(quantile(Nilai, 0.75), 2),
Max = round(max(Nilai), 2),
SD = round(sd(Nilai), 2),
.groups = "drop"
)
stat_desc %>%
kable(caption = "Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)
|
Fitur
|
Min
|
Q1
|
Median
|
Mean
|
Q3
|
Max
|
SD
|
|
Area
|
20420.00
|
36282.50
|
44580.00
|
50166.95
|
61382.00
|
99031.25
|
18864.23
|
|
AspectRation
|
1.02
|
1.43
|
1.55
|
1.58
|
1.70
|
2.11
|
0.24
|
|
Compactness
|
0.66
|
0.76
|
0.80
|
0.80
|
0.83
|
0.94
|
0.06
|
|
ConvexArea
|
20684.00
|
36673.00
|
45122.00
|
50869.02
|
62360.00
|
100890.50
|
19253.93
|
|
Eccentricity
|
0.57
|
0.72
|
0.76
|
0.75
|
0.81
|
0.91
|
0.08
|
|
EquivDiameter
|
161.24
|
214.93
|
238.25
|
249.47
|
279.56
|
376.50
|
47.51
|
|
Extent
|
0.62
|
0.72
|
0.76
|
0.75
|
0.79
|
0.87
|
0.05
|
|
MajorAxisLength
|
183.60
|
253.09
|
296.40
|
318.31
|
376.31
|
561.15
|
80.41
|
|
MinorAxisLength
|
122.51
|
175.89
|
192.49
|
198.67
|
217.25
|
279.28
|
32.83
|
|
Perimeter
|
524.74
|
703.23
|
793.90
|
847.33
|
977.15
|
1388.02
|
189.36
|
|
ShapeFactor1
|
0.00
|
0.01
|
0.01
|
0.01
|
0.01
|
0.01
|
0.00
|
|
ShapeFactor2
|
0.00
|
0.00
|
0.00
|
0.00
|
0.00
|
0.00
|
0.00
|
|
ShapeFactor3
|
0.41
|
0.58
|
0.64
|
0.64
|
0.70
|
0.87
|
0.10
|
|
ShapeFactor4
|
0.99
|
0.99
|
1.00
|
1.00
|
1.00
|
1.00
|
0.00
|
|
Solidity
|
0.98
|
0.99
|
0.99
|
0.99
|
0.99
|
0.99
|
0.00
|
|
roundness
|
0.71
|
0.83
|
0.88
|
0.87
|
0.92
|
0.99
|
0.06
|
4.3 Distribusi Tiap Fitur (Histogram)
warna_hist <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
"#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
"#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")
df_long_clean <- df_numerik_clean %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
mutate(Fitur = factor(Fitur, levels = fitur_numerik))
ggplot(df_long_clean, aes(x = Nilai, fill = Fitur)) +
geom_histogram(bins = 40, color = "white", alpha = 0.85, show.legend = FALSE) +
geom_freqpoly(bins = 40, color = "black", linewidth = 0.5, show.legend = FALSE) +
scale_fill_manual(values = warna_hist) +
facet_wrap(~Fitur, scales = "free", ncol = 4) +
labs(
title = "Distribusi Tiap Fitur Numerik (Setelah Winsorizing)",
subtitle = "Skala bebas per fitur",
x = "Nilai", y = "Frekuensi"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
strip.text = element_text(face = "bold", size = 10)
)

4.4 Distribusi Fitur per Kelas
df_long_kelas <- df_clean %>%
pivot_longer(-Class, names_to = "Fitur", values_to = "Nilai") %>%
mutate(Fitur = factor(Fitur, levels = fitur_numerik))
warna_kelas <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
"#9b59b6","#1abc9c","#e67e22")
ggplot(df_long_kelas, aes(x = Class, y = Nilai, fill = Class)) +
geom_violin(trim = TRUE, alpha = 0.7, show.legend = FALSE) +
geom_boxplot(width = 0.15, fill = "white",
outlier.shape = NA, show.legend = FALSE) +
scale_fill_manual(values = warna_kelas) +
facet_wrap(~Fitur, scales = "free_y", ncol = 4) +
labs(
title = "Distribusi Fitur per Kelas (Violin dan Boxplot)",
subtitle = "Distribusi nilai fitur untuk tiap kelas kacang",
x = "Kelas", y = "Nilai"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
strip.text = element_text(face = "bold", size = 9),
axis.text.x = element_text(angle = 30, hjust = 1, size = 8)
)

4.5 Matriks Korelasi Antar Fitur
cor_matrix <- cor(df_numerik_clean, method = "pearson")
cor_matrix %>%
round(2) %>%
kable(caption = "Matriks Korelasi Pearson Antar Fitur") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE, font_size = 10) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Matriks Korelasi Pearson Antar Fitur
|
|
Area
|
Perimeter
|
MajorAxisLength
|
MinorAxisLength
|
AspectRation
|
Eccentricity
|
ConvexArea
|
EquivDiameter
|
Extent
|
Solidity
|
roundness
|
Compactness
|
ShapeFactor1
|
ShapeFactor2
|
ShapeFactor3
|
ShapeFactor4
|
|
Area
|
1.00
|
0.99
|
0.95
|
0.92
|
0.37
|
0.39
|
1.00
|
1.00
|
0.00
|
-0.32
|
-0.54
|
-0.39
|
-0.89
|
-0.77
|
-0.39
|
-0.50
|
|
Perimeter
|
0.99
|
1.00
|
0.97
|
0.88
|
0.44
|
0.44
|
0.99
|
0.99
|
-0.04
|
-0.37
|
-0.62
|
-0.45
|
-0.86
|
-0.81
|
-0.45
|
-0.52
|
|
MajorAxisLength
|
0.95
|
0.97
|
1.00
|
0.77
|
0.59
|
0.58
|
0.95
|
0.96
|
-0.09
|
-0.33
|
-0.64
|
-0.60
|
-0.76
|
-0.88
|
-0.60
|
-0.57
|
|
MinorAxisLength
|
0.92
|
0.88
|
0.77
|
1.00
|
0.00
|
0.02
|
0.92
|
0.92
|
0.14
|
-0.23
|
-0.28
|
-0.01
|
-0.99
|
-0.49
|
-0.02
|
-0.33
|
|
AspectRation
|
0.37
|
0.44
|
0.59
|
0.00
|
1.00
|
0.96
|
0.37
|
0.38
|
-0.36
|
-0.30
|
-0.78
|
-0.99
|
0.01
|
-0.84
|
-0.98
|
-0.52
|
|
Eccentricity
|
0.39
|
0.44
|
0.58
|
0.02
|
0.96
|
1.00
|
0.39
|
0.39
|
-0.33
|
-0.34
|
-0.75
|
-0.98
|
0.01
|
-0.87
|
-0.99
|
-0.54
|
|
ConvexArea
|
1.00
|
0.99
|
0.95
|
0.92
|
0.37
|
0.39
|
1.00
|
1.00
|
0.00
|
-0.33
|
-0.54
|
-0.39
|
-0.89
|
-0.77
|
-0.40
|
-0.51
|
|
EquivDiameter
|
1.00
|
0.99
|
0.96
|
0.92
|
0.38
|
0.39
|
1.00
|
1.00
|
0.00
|
-0.31
|
-0.54
|
-0.40
|
-0.90
|
-0.78
|
-0.40
|
-0.50
|
|
Extent
|
0.00
|
-0.04
|
-0.09
|
0.14
|
-0.36
|
-0.33
|
0.00
|
0.00
|
1.00
|
0.21
|
0.34
|
0.35
|
-0.14
|
0.23
|
0.34
|
0.15
|
|
Solidity
|
-0.32
|
-0.37
|
-0.33
|
-0.23
|
-0.30
|
-0.34
|
-0.33
|
-0.31
|
0.21
|
1.00
|
0.65
|
0.33
|
0.18
|
0.38
|
0.34
|
0.60
|
|
roundness
|
-0.54
|
-0.62
|
-0.64
|
-0.28
|
-0.78
|
-0.75
|
-0.54
|
-0.54
|
0.34
|
0.65
|
1.00
|
0.78
|
0.25
|
0.79
|
0.77
|
0.53
|
|
Compactness
|
-0.39
|
-0.45
|
-0.60
|
-0.01
|
-0.99
|
-0.98
|
-0.39
|
-0.40
|
0.35
|
0.33
|
0.78
|
1.00
|
-0.01
|
0.87
|
1.00
|
0.55
|
|
ShapeFactor1
|
-0.89
|
-0.86
|
-0.76
|
-0.99
|
0.01
|
0.01
|
-0.89
|
-0.90
|
-0.14
|
0.18
|
0.25
|
-0.01
|
1.00
|
0.47
|
-0.01
|
0.30
|
|
ShapeFactor2
|
-0.77
|
-0.81
|
-0.88
|
-0.49
|
-0.84
|
-0.87
|
-0.77
|
-0.78
|
0.23
|
0.38
|
0.79
|
0.87
|
0.47
|
1.00
|
0.87
|
0.61
|
|
ShapeFactor3
|
-0.39
|
-0.45
|
-0.60
|
-0.02
|
-0.98
|
-0.99
|
-0.40
|
-0.40
|
0.34
|
0.34
|
0.77
|
1.00
|
-0.01
|
0.87
|
1.00
|
0.55
|
|
ShapeFactor4
|
-0.50
|
-0.52
|
-0.57
|
-0.33
|
-0.52
|
-0.54
|
-0.51
|
-0.50
|
0.15
|
0.60
|
0.53
|
0.55
|
0.30
|
0.61
|
0.55
|
1.00
|
4.6 Visualisasi Heatmap Korelasi
ggcorrplot(
cor_matrix,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("#e74c3c", "white", "#2ecc71"),
title = "Heatmap Korelasi Antar Fitur (Pearson)",
ggtheme = theme_minimal(base_size = 11),
tl.cex = 10,
outline.col = "white"
) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
legend.position = "right"
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4.7 Identifikasi Pasangan Fitur Berkorelasi Tinggi
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
rename(Fitur_1 = Var1, Fitur_2 = Var2, Korelasi = Freq) %>%
filter(as.character(Fitur_1) < as.character(Fitur_2)) %>%
mutate(Korelasi = round(Korelasi, 4),
Abs_Kor = abs(Korelasi)) %>%
arrange(desc(Abs_Kor))
korelasi_tinggi <- cor_df %>% filter(Abs_Kor >= 0.8)
korelasi_tinggi %>%
select(Fitur_1, Fitur_2, Korelasi) %>%
kable(caption = "Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.95),
background = "#fde8e8", color = "#c0392b") %>%
row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.80 &
abs(korelasi_tinggi$Korelasi) < 0.95),
background = "#fef9e7", color = "#b7770d")
Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)
|
Fitur_1
|
Fitur_2
|
Korelasi
|
|
Area
|
ConvexArea
|
0.9999
|
|
Compactness
|
ShapeFactor3
|
0.9989
|
|
Area
|
EquivDiameter
|
0.9972
|
|
ConvexArea
|
EquivDiameter
|
0.9970
|
|
AspectRation
|
Compactness
|
-0.9909
|
|
Eccentricity
|
ShapeFactor3
|
-0.9907
|
|
EquivDiameter
|
Perimeter
|
0.9905
|
|
MinorAxisLength
|
ShapeFactor1
|
-0.9877
|
|
ConvexArea
|
Perimeter
|
0.9874
|
|
Area
|
Perimeter
|
0.9866
|
|
AspectRation
|
ShapeFactor3
|
-0.9842
|
|
Compactness
|
Eccentricity
|
-0.9840
|
|
MajorAxisLength
|
Perimeter
|
0.9746
|
|
EquivDiameter
|
MajorAxisLength
|
0.9589
|
|
AspectRation
|
Eccentricity
|
0.9560
|
|
ConvexArea
|
MajorAxisLength
|
0.9529
|
|
Area
|
MajorAxisLength
|
0.9526
|
|
EquivDiameter
|
MinorAxisLength
|
0.9175
|
|
Area
|
MinorAxisLength
|
0.9172
|
|
ConvexArea
|
MinorAxisLength
|
0.9168
|
|
EquivDiameter
|
ShapeFactor1
|
-0.9013
|
|
Area
|
ShapeFactor1
|
-0.8913
|
|
ConvexArea
|
ShapeFactor1
|
-0.8904
|
|
MajorAxisLength
|
ShapeFactor2
|
-0.8844
|
|
MinorAxisLength
|
Perimeter
|
0.8784
|
|
ShapeFactor2
|
ShapeFactor3
|
0.8715
|
|
Compactness
|
ShapeFactor2
|
0.8679
|
|
Eccentricity
|
ShapeFactor2
|
-0.8662
|
|
Perimeter
|
ShapeFactor1
|
-0.8602
|
|
AspectRation
|
ShapeFactor2
|
-0.8444
|
|
Perimeter
|
ShapeFactor2
|
-0.8079
|
cat("\nTotal pasangan berkorelasi tinggi (|r| ≥ 0.80):", nrow(korelasi_tinggi), "\n")
##
## Total pasangan berkorelasi tinggi (|r| ≥ 0.80): 31
4.8 Kesimpulan EDA
n_kor_tinggi <- nrow(korelasi_tinggi)
n_kor_sangat_tinggi <- sum(abs(korelasi_tinggi$Korelasi) >= 0.95)
kesimpulan_4 <- data.frame(
Aspek = c(
"Jumlah fitur yang dianalisis",
"Pasangan korelasi tinggi (|r| ≥ 0.80)",
"Pasangan korelasi sangat tinggi (|r| ≥ 0.95)",
"Fitur terkorelasi tertinggi",
"Rekomendasi"
),
Hasil = c(
paste0(length(fitur_numerik), " fitur numerik"),
paste0(n_kor_tinggi, " pasangan"),
paste0(n_kor_sangat_tinggi, " pasangan"),
paste0(cor_df$Fitur_1[1], " — ", cor_df$Fitur_2[1],
" (r = ", cor_df$Korelasi[1], ")"),
"Pertimbangkan PCA atau drop fitur redundan"
),
Status = c("oke", "perlu cek", "wajib", "info", "wajib")
)
kesimpulan_4 %>%
kable(caption = "Kesimpulan Tahap 4: EDA") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(kesimpulan_4$Status == "oke"),
background = "#eafaf1", color = "#1e8449") %>%
row_spec(which(kesimpulan_4$Status == "perlu cek"),
background = "#fef9e7", color = "#b7770d") %>%
row_spec(which(kesimpulan_4$Status == "wajib"),
background = "#fde8e8", color = "#c0392b") %>%
column_spec(3, bold = TRUE)
Kesimpulan Tahap 4: EDA
|
Aspek
|
Hasil
|
Status
|
|
Jumlah fitur yang dianalisis
|
16 fitur numerik
|
oke
|
|
Pasangan korelasi tinggi (|r| ≥ 0.80)
|
31 pasangan
|
perlu cek
|
|
Pasangan korelasi sangat tinggi (|r| ≥ 0.95)
|
17 pasangan
|
wajib
|
|
Fitur terkorelasi tertinggi
|
Area — ConvexArea (r = 0.9999)
|
info
|
|
Rekomendasi
|
Pertimbangkan PCA atau drop fitur redundan
|
wajib
|
Tahap 5: Feature Selection / Reduksi Dimensi (Opsional)
5.1 Strategi: Drop Fitur Redundan
fitur_drop <- c()
for (i in seq_len(nrow(korelasi_tinggi))) {
f1 <- as.character(korelasi_tinggi$Fitur_1[i])
f2 <- as.character(korelasi_tinggi$Fitur_2[i])
if (!(f1 %in% fitur_drop) && !(f2 %in% fitur_drop)) {
fitur_drop <- c(fitur_drop, f2)
}
}
fitur_terpilih <- setdiff(fitur_numerik, fitur_drop)
cat("Fitur yang redundan:\n")
## Fitur yang redundan:
cat(paste(" -", fitur_drop, collapse = "\n"), "\n\n")
## - ConvexArea
## - ShapeFactor3
## - EquivDiameter
## - Compactness
## - ShapeFactor1
## - Perimeter
## - Eccentricity
## - MajorAxisLength
## - MinorAxisLength
## - ShapeFactor2
cat("Fitur yang dipertahankan:\n")
## Fitur yang dipertahankan:
cat(paste(" +", fitur_terpilih, collapse = "\n"), "\n\n")
## + Area
## + AspectRation
## + Extent
## + Solidity
## + roundness
## + ShapeFactor4
cat("Jumlah fitur sebelum seleksi:", length(fitur_numerik), "\n")
## Jumlah fitur sebelum seleksi: 16
cat("Jumlah fitur setelah seleksi:", length(fitur_terpilih), "\n")
## Jumlah fitur setelah seleksi: 6
tabel_seleksi <- data.frame(
Fitur = fitur_numerik,
Status = ifelse(fitur_numerik %in% fitur_drop, "Drop (Redundan)", "Dipertahankan")
)
tabel_seleksi %>%
kable(caption = "Status Fitur Setelah Feature Selection") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(tabel_seleksi$Status == "Drop (Redundan)"),
background = "#fde8e8", color = "#c0392b") %>%
row_spec(which(tabel_seleksi$Status == "Dipertahankan"),
background = "#eafaf1", color = "#1e8449")
Status Fitur Setelah Feature Selection
|
Fitur
|
Status
|
|
Area
|
Dipertahankan
|
|
Perimeter
|
Drop (Redundan)
|
|
MajorAxisLength
|
Drop (Redundan)
|
|
MinorAxisLength
|
Drop (Redundan)
|
|
AspectRation
|
Dipertahankan
|
|
Eccentricity
|
Drop (Redundan)
|
|
ConvexArea
|
Drop (Redundan)
|
|
EquivDiameter
|
Drop (Redundan)
|
|
Extent
|
Dipertahankan
|
|
Solidity
|
Dipertahankan
|
|
roundness
|
Dipertahankan
|
|
Compactness
|
Drop (Redundan)
|
|
ShapeFactor1
|
Drop (Redundan)
|
|
ShapeFactor2
|
Drop (Redundan)
|
|
ShapeFactor3
|
Drop (Redundan)
|
|
ShapeFactor4
|
Dipertahankan
|
5.2 PCA: Analisis Komponen Utama
df_seleksi <- df_clean %>% select(all_of(fitur_terpilih))
pca_result <- prcomp(df_seleksi, center = TRUE, scale. = TRUE)
var_explained <- summary(pca_result)$importance
var_df <- data.frame(
PC = paste0("PC", 1:ncol(var_explained)),
SD = round(var_explained[1, ], 4),
Proporsi_Var = round(var_explained[2, ], 4),
Kumulatif_Var = round(var_explained[3, ], 4)
)
var_df %>%
kable(caption = "Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(var_df$Kumulatif_Var >= 0.90)[1],
background = "#eafaf1", color = "#1e8449")
Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)
|
|
PC
|
SD
|
Proporsi_Var
|
Kumulatif_Var
|
|
PC1
|
PC1
|
1.7837
|
0.5302
|
0.5302
|
|
PC2
|
PC2
|
1.0328
|
0.1778
|
0.7080
|
|
PC3
|
PC3
|
0.8542
|
0.1216
|
0.8296
|
|
PC4
|
PC4
|
0.7082
|
0.0836
|
0.9132
|
|
PC5
|
PC5
|
0.6681
|
0.0744
|
0.9876
|
|
PC6
|
PC6
|
0.2727
|
0.0124
|
1.0000
|
5.3 Scree Plot PCA
fviz_eig(
pca_result,
addlabels = TRUE,
ylim = c(0, max(var_explained[2, ] * 100) + 5),
barfill = "#3498db",
barcolor = "white",
linecolor = "#e74c3c",
main = "Variansi Tiap Komponen Utama (PCA)"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))

5.4 Biplot PCA
fviz_pca_biplot(
pca_result,
geom.ind = "point",
col.ind = df_clean$Class,
palette = c("#2ecc71","#3498db","#e74c3c","#f39c12",
"#9b59b6","#1abc9c","#e67e22"),
addEllipses = TRUE,
ellipse.level = 0.90,
col.var = "black",
repel = TRUE,
legend.title = "Kelas",
title = "Fitur & Distribusi Kelas (PCA)"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Ignoring unknown labels:
## • linetype : "Kelas"
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
## that many of them.
## Warning: Removed 2636 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

5.5 Keputusan: Pilih Pendekatan
n_pc_90 <- min(which(var_df$Kumulatif_Var >= 0.90))
cat("PC yang dibutuhkan untuk menjelaskan >= 90% variansi:", n_pc_90, "\n")
## PC yang dibutuhkan untuk menjelaskan >= 90% variansi: 4
cat("Jumlah fitur setelah seleksi manual:", length(fitur_terpilih), "\n\n")
## Jumlah fitur setelah seleksi manual: 6
if (n_pc_90 < length(fitur_terpilih)) {
cat("Rekomendasi: Gunakan PCA dengan", n_pc_90, "komponen untuk clustering\n")
} else {
cat("Rekomendasi: Gunakan seleksi manual (", length(fitur_terpilih), "fitur) — PCA tidak memberikan reduksi yang signifikan\n")
}
## Rekomendasi: Gunakan PCA dengan 4 komponen untuk clustering
pc_scores <- as.data.frame(pca_result$x[, 1:n_pc_90])
pc_scores$Class <- df_clean$Class
cat("Dimensi data PC scores:", nrow(pc_scores), "baris x",
ncol(pc_scores) - 1, "komponen + kolom Class\n")
## Dimensi data PC scores: 13543 baris x 4 komponen + kolom Class
Tahap 6: Pisahkan Kolom Label (Class)
6.1 Pisahkan Fitur dan Label
X_clustering <- df_clean %>% select(all_of(fitur_terpilih))
y_label <- df_clean$Class
cat("Data untuk Clustering\n")
## Data untuk Clustering
cat("\nShape fitur (X):", nrow(X_clustering), "x", ncol(X_clustering), "\n")
##
## Shape fitur (X): 13543 x 6
cat("Panjang label (y):", length(y_label), "\n")
## Panjang label (y): 13543
cat("Label Class tersedia:", n_distinct(y_label), "kelas unik\n")
## Label Class tersedia: 7 kelas unik
cat("Nama kelas:", paste(unique(y_label), collapse = ", "), "\n\n")
## Nama kelas: SEKER, BARBUNYA, BOMBAY, CALI, HOROZ, SIRA, DERMASON
tabel_pisah <- data.frame(
Objek = c("X_clustering", "y_label"),
Isi = c(
paste0(ncol(X_clustering), " fitur numerik terpilih"),
"Kolom Class"
),
Dipakai_Saat = c(
"Training clustering",
"Evaluasi hasil clustering"
),
Dimensi = c(
paste0(nrow(X_clustering), " x ", ncol(X_clustering)),
paste0(length(y_label), " baris")
)
)
tabel_pisah %>%
kable(caption = "Pemisahan Fitur dan Label untuk Clustering") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(1, background = "#eafaf1", color = "#1e8449") %>%
row_spec(2, background = "#fef9e7", color = "#b7770d")
Pemisahan Fitur dan Label untuk Clustering
|
Objek
|
Isi
|
Dipakai_Saat
|
Dimensi
|
|
X_clustering
|
6 fitur numerik terpilih
|
Training clustering
|
13543 x 6
|
|
y_label
|
Kolom Class
|
Evaluasi hasil clustering
|
13543 baris
|
Tahap 7: Normalisasi / Standarisasi Fitur
7.1 Alasan Normalisasi
range_sebelum <- X_clustering %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
Min = round(min(Nilai), 2),
Max = round(max(Nilai), 2),
Range = round(max(Nilai) - min(Nilai), 2),
Mean = round(mean(Nilai), 2),
SD = round(sd(Nilai), 2),
.groups = "drop"
) %>%
arrange(desc(Range))
range_sebelum %>%
kable(caption = "Range Nilai Fitur Sebelum Normalisasi") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(1:3, background = "#fde8e8", color = "#c0392b")
Range Nilai Fitur Sebelum Normalisasi
|
Fitur
|
Min
|
Max
|
Range
|
Mean
|
SD
|
|
Area
|
20420.00
|
99031.25
|
78611.25
|
50166.95
|
18864.23
|
|
AspectRation
|
1.02
|
2.11
|
1.09
|
1.58
|
0.24
|
|
roundness
|
0.71
|
0.99
|
0.28
|
0.87
|
0.06
|
|
Extent
|
0.62
|
0.87
|
0.25
|
0.75
|
0.05
|
|
Solidity
|
0.98
|
0.99
|
0.02
|
0.99
|
0.00
|
|
ShapeFactor4
|
0.99
|
1.00
|
0.01
|
1.00
|
0.00
|
cat("\nFitur dengan range terbesar:",
range_sebelum$Fitur[1], "(range =", range_sebelum$Range[1], ")\n")
##
## Fitur dengan range terbesar: Area (range = 78611.25 )
cat("Fitur dengan range terkecil:",
range_sebelum$Fitur[nrow(range_sebelum)],
"(range =", range_sebelum$Range[nrow(range_sebelum)], ")\n")
## Fitur dengan range terkecil: ShapeFactor4 (range = 0.01 )
cat("\nPerbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering\n")
##
## Perbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering
7.2 Standardisasi Z-score (StandardScaler)
X_scaled_zscore <- X_clustering %>%
mutate(across(everything(), ~ (. - mean(.)) / sd(.)))
zscore_check <- X_scaled_zscore %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
Mean_Baru = round(mean(Nilai), 6),
SD_Baru = round(sd(Nilai), 6),
Min_Baru = round(min(Nilai), 4),
Max_Baru = round(max(Nilai), 4),
.groups = "drop"
)
zscore_check %>%
kable(caption = "Verifikasi Hasil Z-score Standardization") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(abs(zscore_check$Mean_Baru) < 1e-4 & abs(zscore_check$SD_Baru - 1) < 1e-4),
background = "#eafaf1", color = "#1e8449")
Verifikasi Hasil Z-score Standardization
|
Fitur
|
Mean_Baru
|
SD_Baru
|
Min_Baru
|
Max_Baru
|
|
Area
|
0
|
1
|
-1.5769
|
2.5903
|
|
AspectRation
|
0
|
1
|
-2.3166
|
2.2397
|
|
Extent
|
0
|
1
|
-2.7860
|
2.4199
|
|
ShapeFactor4
|
0
|
1
|
-2.4066
|
1.3271
|
|
Solidity
|
0
|
1
|
-2.3219
|
2.0201
|
|
roundness
|
0
|
1
|
-2.8469
|
2.0014
|
cat("\nMean semua fitur (harus ~0):",
round(mean(as.matrix(X_scaled_zscore)), 6), "\n")
##
## Mean semua fitur (harus ~0): 0
cat("SD semua fitur (harus ~1):",
round(mean(apply(X_scaled_zscore, 2, sd)), 6), "\n")
## SD semua fitur (harus ~1): 1
7.3 Min-Max Normalization (MinMaxScaler)
X_scaled_minmax <- X_clustering %>%
mutate(across(everything(), ~ (. - min(.)) / (max(.) - min(.))))
minmax_check <- X_scaled_minmax %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
group_by(Fitur) %>%
summarise(
Min_Baru = round(min(Nilai), 6),
Max_Baru = round(max(Nilai), 6),
Mean_Baru = round(mean(Nilai), 4),
.groups = "drop"
)
minmax_check %>%
kable(caption = "Verifikasi Hasil Min-Max Normalization (Rentang [0,1])") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(which(minmax_check$Min_Baru == 0 & minmax_check$Max_Baru == 1),
background = "#eafaf1", color = "#1e8449")
Verifikasi Hasil Min-Max Normalization (Rentang [0,1])
|
Fitur
|
Min_Baru
|
Max_Baru
|
Mean_Baru
|
|
Area
|
0
|
1
|
0.3784
|
|
AspectRation
|
0
|
1
|
0.5084
|
|
Extent
|
0
|
1
|
0.5352
|
|
ShapeFactor4
|
0
|
1
|
0.6446
|
|
Solidity
|
0
|
1
|
0.5348
|
|
roundness
|
0
|
1
|
0.5872
|
7.4 Perbandingan Distribusi: Sebelum vs Sesudah Normalisasi
fitur_sampel <- fitur_terpilih[1:min(6, length(fitur_terpilih))]
df_vis <- bind_rows(
X_clustering %>%
select(all_of(fitur_sampel)) %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
mutate(Kondisi = "Sebelum Normalisasi"),
X_scaled_zscore %>%
select(all_of(fitur_sampel)) %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
mutate(Kondisi = "Z-score"),
X_scaled_minmax %>%
select(all_of(fitur_sampel)) %>%
pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
mutate(Kondisi = "Min-Max")
) %>%
mutate(
Kondisi = factor(Kondisi,
levels = c("Sebelum Normalisasi", "Z-score", "Min-Max")),
Fitur = factor(Fitur, levels = fitur_sampel)
)
ggplot(df_vis, aes(x = Nilai, fill = Kondisi)) +
geom_histogram(bins = 40, color = "white", alpha = 0.8) +
scale_fill_manual(values = c("#e74c3c", "#3498db", "#2ecc71")) +
facet_grid(Kondisi ~ Fitur, scales = "free") +
labs(
title = "PeSebelum vs Sesudah Normalisasi",
subtitle = paste0("Ditampilkan ", length(fitur_sampel),
" fitur pertama (skala bebas per panel)"),
x = "Nilai", y = "Frekuensi",
fill = "Metode"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
strip.text = element_text(face = "bold", size = 9),
legend.position = "bottom"
)

7.5 Pilih Metode Normalisasi untuk Clustering
X_final <- X_scaled_zscore
cat("Data Final untuk Clustering\n")
## Data Final untuk Clustering
cat("\nMetode normalisasi: Z-score (StandardScaler)\n")
##
## Metode normalisasi: Z-score (StandardScaler)
cat("Shape X_final:", nrow(X_final), "x", ncol(X_final), "\n")
## Shape X_final: 13543 x 6
cat("Kolom:", paste(names(X_final), collapse = ", "), "\n\n")
## Kolom: Area, AspectRation, Extent, Solidity, roundness, ShapeFactor4
cat("Statistik X_final:\n")
## Statistik X_final:
print(round(sapply(X_final, function(x) c(mean=mean(x), sd=sd(x), min=min(x), max=max(x))), 4))
## Area AspectRation Extent Solidity roundness ShapeFactor4
## mean 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sd 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## min -1.5769 -2.3166 -2.7860 -2.3219 -2.8469 -2.4066
## max 2.5903 2.2397 2.4199 2.0201 2.0014 1.3271
8.1 Penentuan Jumlah Kluster Optimal
8.1.1 Metode Elbow (WSS)
set.seed(42)
wss_values <- sapply(1:10, function(k) {
kmeans(X_final, centers = k, nstart = 10, iter.max = 100)$tot.withinss
})
wss_df <- data.frame(k = 1:10, WSS = wss_values)
delta_wss <- diff(wss_values)
pct_drop <- abs(delta_wss) / wss_values[-length(wss_values)] * 100
elbow_k <- which(pct_drop < 10)[1] + 1
elbow_k <- if (is.na(elbow_k)) 7 else elbow_k
ggplot(wss_df, aes(x = k, y = WSS)) +
geom_line(color = "#3498db", linewidth = 1.2) +
geom_point(color = "#e74c3c", size = 4) +
geom_vline(xintercept = elbow_k, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
annotate("text", x = elbow_k + 0.35, y = max(wss_values) * 0.85,
label = paste0("k = ", elbow_k, " (siku)"),
color = "#e74c3c", size = 4, fontface = "italic") +
scale_x_continuous(breaks = 1:10) +
labs(
title = "Metode Elbow — Penentuan Jumlah Kluster Optimal (K-Means)",
subtitle = "Cari titik 'siku' di mana WSS mulai melandai",
x = "Jumlah Kluster (k)", y = "Total Within-Cluster SS (WSS)"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50")
)

8.1.2 Metode Silhouette
set.seed(42)
samp_idx_sil <- sample(nrow(X_final), min(2000, nrow(X_final)))
X_sil_sample <- X_final[samp_idx_sil, ]
sil_values_km <- sapply(2:8, function(k) {
km <- kmeans(X_final, centers = k, nstart = 10, iter.max = 100)
sil <- silhouette(km$cluster[samp_idx_sil], dist(X_sil_sample))
mean(sil[, 3])
})
sil_df_km <- data.frame(k = 2:8, Silhouette = sil_values_km)
best_k_km <- sil_df_km$k[which.max(sil_df_km$Silhouette)]
ggplot(sil_df_km, aes(x = k, y = Silhouette)) +
geom_line(color = "#2ecc71", linewidth = 1.2) +
geom_point(aes(color = k == best_k_km), size = 4, show.legend = FALSE) +
scale_color_manual(values = c("FALSE" = "#2ecc71", "TRUE" = "#e74c3c")) +
geom_vline(xintercept = best_k_km, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
annotate("text", x = best_k_km + 0.3,
y = max(sil_values_km) * 0.97,
label = paste0("k = ", best_k_km, " (optimal)"),
color = "#e74c3c", size = 4, fontface = "italic") +
scale_x_continuous(breaks = 2:8) +
labs(
title = "Metode Silhouette — Penentuan Jumlah Kluster Optimal (K-Means)",
subtitle = "Skor lebih tinggi = kluster lebih kohesif & terpisah",
x = "Jumlah Kluster (k)", y = "Rata-rata Silhouette Score"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50")
)

cat("k optimal berdasarkan Silhouette:", best_k_km, "\n")
## k optimal berdasarkan Silhouette: 2
8.1.3 Keputusan k
k_optimal <- 7
cat(" Keputusan: k =", k_optimal, "\n")
## Keputusan: k = 7
cat(" Alasan : Sesuai jumlah kelas asli (7 jenis kacang)\n")
## Alasan : Sesuai jumlah kelas asli (7 jenis kacang)
cat(" + dikonfirmasi Elbow & Silhouette\n")
## + dikonfirmasi Elbow & Silhouette
8.2 Training K-Means
set.seed(42)
km_result <- kmeans(
X_final,
centers = k_optimal,
nstart = 25,
iter.max = 300,
algorithm = "Hartigan-Wong"
)
km_labels <- km_result$cluster
cat("K-Means selesai!\n\n")
## K-Means selesai!
cat("Algoritma : Hartigan-Wong\n")
## Algoritma : Hartigan-Wong
cat("Metrik jarak : Euclidean (L2 norm)\n")
## Metrik jarak : Euclidean (L2 norm)
cat("Pusat kluster : Mean (rata-rata koordinat)\n")
## Pusat kluster : Mean (rata-rata koordinat)
cat("Jumlah kluster :", nrow(km_result$centers), "\n")
## Jumlah kluster : 7
cat("Iterasi konvergen :", km_result$iter, "\n")
## Iterasi konvergen : 6
cat("Total WSS :", round(km_result$tot.withinss, 2), "\n")
## Total WSS : 25231.11
cat("Between-SS/Total-SS:",
round(km_result$betweenss / km_result$totss * 100, 2), "%\n\n")
## Between-SS/Total-SS: 68.95 %
cat("Distribusi anggota per kluster:\n")
## Distribusi anggota per kluster:
print(table(km_labels))
## km_labels
## 1 2 3 4 5 6 7
## 2788 1408 2722 2484 1091 1179 1871
8.3 Evaluasi Internal K-Means
8.3.1 Silhouette Plot
set.seed(42)
samp_km <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_km <- silhouette(km_labels[samp_km], dist(X_final[samp_km, ]))
avg_sil_km <- mean(sil_km[, 3])
fviz_silhouette(sil_km,
palette = "jco",
ggtheme = theme_minimal(base_size = 11),
print.summary = FALSE) +
geom_hline(yintercept = avg_sil_km, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
labs(
title = "K-Means Clustering (Silhouette)",
subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_km, 4),
" | Range: -1 (buruk) hingga 1 (sempurna)")
) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50")
)

8.3.2 Tabel Metrik Internal
eval_km <- data.frame(
Metrik = c("Silhouette Score",
"Between-SS / Total-SS (%)",
"Total Within-SS",
"Iterasi Konvergen"),
Nilai = c(round(avg_sil_km, 4),
round(km_result$betweenss / km_result$totss * 100, 2),
round(km_result$tot.withinss, 2),
km_result$iter),
Keterangan = c("Mendekati 1 = kluster padat & terpisah",
"Semakin tinggi = pemisahan kluster lebih baik",
"Semakin kecil = kluster lebih kompak",
"Cepat konvergen = solusi stabil")
)
eval_km %>%
kable(caption = "Evaluasi Internal — K-Means Clustering") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(1, background = "#eafaf1", color = "#1e8449")
Evaluasi Internal — K-Means Clustering
|
Metrik
|
Nilai
|
Keterangan
|
|
Silhouette Score
|
0.2688
|
Mendekati 1 = kluster padat & terpisah
|
|
Between-SS / Total-SS (%)
|
68.9500
|
Semakin tinggi = pemisahan kluster lebih baik
|
|
Total Within-SS
|
25231.1100
|
Semakin kecil = kluster lebih kompak
|
|
Iterasi Konvergen
|
6.0000
|
Cepat konvergen = solusi stabil
|
8.4 Visualisasi Hasil K-Means
8.4.1 Cluster Plot (PCA 2D)
warna_kluster <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
"#9b59b6","#1abc9c","#e67e22")
fviz_cluster(
km_result,
data = X_final,
palette = warna_kluster,
geom = "point",
ellipse.type = "convex",
alpha = 0.4,
pointsize = 1,
ggtheme = theme_minimal(base_size = 12)
) +
labs(
title = "K-Means Clustering (PCA 2D)",
subtitle = paste0("k = ", k_optimal, "Proyeksi ke 2 PC pertama"),
color = "Kluster", fill = "Kluster", shape = "Kluster"
) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50")
)

8.4.2 Distribusi Anggota per Kluster
dist_km <- data.frame(
Kluster = paste0("K", sort(unique(km_labels))),
Jumlah = as.integer(table(km_labels))
) %>%
mutate(
Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1),
Kluster = reorder(Kluster, -Jumlah)
)
ggplot(dist_km, aes(x = Kluster, y = Jumlah, fill = Kluster)) +
geom_col(width = 0.7, show.legend = FALSE) +
geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
vjust = -0.3, size = 3.8, fontface = "bold") +
scale_fill_manual(values = warna_kluster) +
scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
labs(title = "Distribusi Anggota per Kluster — K-Means",
x = "Kluster", y = "Jumlah Anggota") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.x = element_text(face = "bold"))

8.4.3 Heatmap Profil Centroid
centroid_km <- as.data.frame(km_result$centers) %>%
mutate(Kluster = paste0("K", 1:k_optimal)) %>%
pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")
ggplot(centroid_km, aes(x = Fitur, y = Kluster, fill = Nilai)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#2ecc71",
midpoint = 0, name = "Z-score") +
labs(
title = "Profil Centroid K-Means (Z-score)",
subtitle = "Centroid = rata-rata koordinat semua titik di kluster | Merah = rendah | Hijau = tinggi",
x = "Fitur", y = "Kluster"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
axis.text.x = element_text(angle = 35, hjust = 1, face = "bold")
)

8.5 Evaluasi Eksternal K-Means
purity_score <- function(labels_true, labels_pred) {
tbl <- table(labels_true, labels_pred)
sum(apply(tbl, 2, max)) / length(labels_true)
}
purity_km <- purity_score(y_label, km_labels)
table(Kluster = km_labels, Kelas_Asli = y_label) %>%
as.data.frame.matrix() %>%
rownames_to_column("Kluster") %>%
mutate(Kluster = paste0("K", Kluster)) %>%
kable(caption = "Tabel Kontingensi — K-Means vs Kelas Asli") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Tabel Kontingensi — K-Means vs Kelas Asli
|
Kluster
|
BARBUNYA
|
BOMBAY
|
CALI
|
DERMASON
|
HOROZ
|
SEKER
|
SIRA
|
|
K1
|
52
|
0
|
16
|
1407
|
15
|
95
|
1203
|
|
K2
|
76
|
127
|
710
|
40
|
296
|
4
|
155
|
|
K3
|
55
|
0
|
21
|
1456
|
24
|
52
|
1114
|
|
K4
|
3
|
1
|
2
|
583
|
0
|
1846
|
49
|
|
K5
|
943
|
0
|
15
|
52
|
20
|
30
|
31
|
|
K6
|
18
|
2
|
65
|
8
|
1036
|
0
|
50
|
|
K7
|
175
|
392
|
801
|
0
|
469
|
0
|
34
|
cat("\nPurity Score K-Means:", round(purity_km, 4),
paste0("(", round(purity_km * 100, 2), "%)"), "\n")
##
## Purity Score K-Means: 0.6054 (60.54%)
8.6 Kesimpulan K-Means
data.frame(
Aspek = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
"Jumlah Kluster", "Silhouette Score",
"Between-SS / Total-SS", "Purity Score", "Status"),
Hasil = c("K-Means (Hartigan-Wong)",
"Mean / rata-rata koordinat (bukan titik data aktual)",
"Euclidean (L2 norm)",
paste0(k_optimal, " kluster"),
round(avg_sil_km, 4),
paste0(round(km_result$betweenss / km_result$totss * 100, 2), "%"),
round(purity_km, 4),
"lanjut ke Tahap K-Median")
) %>%
kable(caption = "Kesimpulan Tahap 8: K-Means Clustering") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(8, background = "#eafaf1", color = "#1e8449")
Kesimpulan Tahap 8: K-Means Clustering
|
Aspek
|
Hasil
|
|
Algoritma
|
K-Means (Hartigan-Wong)
|
|
Pusat Kluster
|
Mean / rata-rata koordinat (bukan titik data aktual)
|
|
Metrik Jarak
|
Euclidean (L2 norm)
|
|
Jumlah Kluster
|
7 kluster
|
|
Silhouette Score
|
0.2688
|
|
Between-SS / Total-SS
|
68.95%
|
|
Purity Score
|
0.6054
|
|
Status
|
lanjut ke Tahap K-Median
|
Tahap 10: DBSCAN Clustering
10.1 Penentuan Parameter eps (k-distance plot)
set.seed(42)
samp_db <- sample(nrow(X_final), min(3000, nrow(X_final)))
X_db_sample <- X_final[samp_db, ]
minPts <- ncol(X_final) * 2
kNNdistplot(X_db_sample, k = minPts)
abline(h = 1.5, col = "red", lty = 2, lwd = 2)
title(main = "k-distance Plot — Penentuan Nilai eps DBSCAN",
sub = paste0("Garis merah = kandidat eps | minPts = ", minPts))

10.2 Training DBSCAN
set.seed(42)
eps_val <- 1.5
minPts_val <- ncol(X_final) * 2
db_result <- dbscan::dbscan(X_final, eps = eps_val, minPts = minPts_val)
db_labels <- db_result$cluster # 0 = noise
cat("DBSCAN selesai!\n\n")
## DBSCAN selesai!
cat("Parameter eps :", eps_val, "\n")
## Parameter eps : 1.5
cat("Parameter minPts :", minPts_val, "\n")
## Parameter minPts : 12
cat("Jumlah kluster :", max(db_labels), "\n")
## Jumlah kluster : 1
cat("Titik noise (0) :", sum(db_labels == 0), "\n\n")
## Titik noise (0) : 0
cat("Distribusi label:\n")
## Distribusi label:
print(table(db_labels))
## db_labels
## 1
## 13543
10.3 Evaluasi Internal DBSCAN
set.seed(42)
non_noise_idx <- which(db_labels != 0)
if (length(unique(db_labels[non_noise_idx])) >= 2) {
samp_db2 <- sample(non_noise_idx, min(2000, length(non_noise_idx)))
sil_db <- silhouette(db_labels[samp_db2], dist(X_final[samp_db2, ]))
avg_sil_db <- mean(sil_db[, 3])
fviz_silhouette(sil_db,
palette = "jco",
ggtheme = theme_minimal(base_size = 11),
print.summary = FALSE) +
geom_hline(yintercept = avg_sil_db, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
labs(
title = "Silhouette Plot — DBSCAN Clustering",
subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_db, 4),
" | Noise points dikecualikan")
) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
} else {
avg_sil_db <- NA
cat("Tidak cukup kluster untuk menghitung Silhouette.\n")
}
## Tidak cukup kluster untuk menghitung Silhouette.
10.4 Visualisasi DBSCAN (PCA 2D)
pca_2d <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d$Kluster <- factor(ifelse(db_labels == 0, "Noise", paste0("K", db_labels)))
warna_db <- c("Noise" = "gray70",
setNames(c("#2ecc71","#3498db","#e74c3c","#f39c12",
"#9b59b6","#1abc9c","#e67e22","#34495e"),
paste0("K", 1:8)))
ggplot(pca_2d, aes(x = PC1, y = PC2, color = Kluster)) +
geom_point(alpha = 0.4, size = 0.8) +
scale_color_manual(values = warna_db) +
labs(
title = "Visualisasi DBSCAN Clustering (PCA 2D)",
subtitle = paste0("eps = ", eps_val, " | minPts = ", minPts_val,
" | Kluster = ", max(db_labels),
" | Noise = ", sum(db_labels == 0), " titik"),
color = "Label"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

10.5 Evaluasi Eksternal DBSCAN
non_noise <- db_labels != 0
purity_db <- purity_score(y_label[non_noise], db_labels[non_noise])
cat("Purity Score DBSCAN (non-noise):", round(purity_db, 4),
paste0("(", round(purity_db * 100, 2), "%)"), "\n")
## Purity Score DBSCAN (non-noise): 0.2618 (26.18%)
cat("Proporsi noise:", round(mean(db_labels == 0) * 100, 2), "%\n")
## Proporsi noise: 0 %
Tahap 11: Mean Shift Clustering
11.1 Training Mean Shift
set.seed(42)
samp_ms <- sample(nrow(X_final), min(800, nrow(X_final)))
X_ms <- as.matrix(X_final[samp_ms, ])
ms_result <- ms(X_ms, h = 1.5, scaled = FALSE)

ms_labels_samp <- ms_result$cluster.label
n_kluster_ms <- length(unique(ms_labels_samp))
cat("Mean Shift selesai!\n\n")
## Mean Shift selesai!
cat("Bandwidth (h) :", 1.5, "\n")
## Bandwidth (h) : 1.5
cat("Jumlah kluster :", n_kluster_ms, "\n\n")
## Jumlah kluster : 1
cat("Distribusi label (subsample):\n")
## Distribusi label (subsample):
print(table(ms_labels_samp))
## ms_labels_samp
## 1
## 800
11.2 Assign Label ke Full Data (nearest mode)
mean_shift <- function(X, h, max_iter = 100, tol = 1e-4) {
n <- nrow(X)
modes <- X
for (iter in 1:max_iter) {
modes_new <- modes
for (i in 1:n) {
dists <- rowSums((X - matrix(modes[i, ], n, ncol(X), byrow = TRUE))^2)
weights <- exp(-dists / (2 * h^2))
modes_new[i, ] <- colSums(X * weights) / sum(weights)
}
shift <- max(sqrt(rowSums((modes_new - modes)^2)))
modes <- modes_new
if (shift < tol) break
}
return(modes)
}
assign_clusters <- function(modes, tol = 0.5) {
n <- nrow(modes)
labels <- rep(0, n)
k <- 0
for (i in 1:n) {
if (labels[i] == 0) {
k <- k + 1
labels[i] <- k
}
if (i < n) {
for (j in (i+1):n) {
if (labels[j] == 0) {
dist_ij <- sqrt(sum((modes[i, ] - modes[j, ])^2))
if (dist_ij < tol) labels[j] <- labels[i]
}
}
}
}
return(labels)
}
assign_to_mode <- function(X, centers) {
apply(X, 1, function(xi) {
dists <- apply(centers, 1, function(m) sqrt(sum((xi - m)^2)))
which.min(dists)
})
}
cat("Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.\n")
## Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.
set.seed(42)
samp_ms <- sample(nrow(X_final), min(500, nrow(X_final)))
X_ms <- as.matrix(X_final[samp_ms, ])
cat("Menjalankan Mean Shift dengan h = 0.8...\n")
## Menjalankan Mean Shift dengan h = 0.8...
final_modes <- mean_shift(X_ms, h = 0.8, max_iter = 50, tol = 1e-3)
ms_labels_samp <- assign_clusters(final_modes, tol = 0.3)
n_kluster_ms <- length(unique(ms_labels_samp))
cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")
## Jumlah kluster ditemukan: 3
print(table(ms_labels_samp))
## ms_labels_samp
## 1 2 3
## 435 41 24
if (n_kluster_ms == 1) {
cat("\nMasih 1 kluster, mencoba h = 0.5...\n")
final_modes <- mean_shift(X_ms, h = 0.5, max_iter = 50, tol = 1e-3)
ms_labels_samp <- assign_clusters(final_modes, tol = 0.2)
n_kluster_ms <- length(unique(ms_labels_samp))
cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")
print(table(ms_labels_samp))
}
mode_centers <- t(sapply(unique(ms_labels_samp), function(k) {
colMeans(X_ms[ms_labels_samp == k, , drop = FALSE])
}))
ms_labels_full <- assign_to_mode(as.matrix(X_final), mode_centers)
cat("\nDistribusi label full data:\n")
##
## Distribusi label full data:
print(table(ms_labels_full))
## ms_labels_full
## 1 2 3
## 10220 2102 1221
11.3 Evaluasi Internal Mean Shift
set.seed(42)
samp_ms2 <- sample(nrow(X_final), min(2000, nrow(X_final)))
n_kluster_full <- length(unique(ms_labels_full))
cat("Jumlah kluster unik di full data:", n_kluster_full, "\n")
## Jumlah kluster unik di full data: 3
if (n_kluster_full >= 2) {
sil_ms <- silhouette(ms_labels_full[samp_ms2], dist(X_final[samp_ms2, ]))
avg_sil_ms <- mean(sil_ms[, 3])
fviz_silhouette(sil_ms,
palette = "jco",
ggtheme = theme_minimal(base_size = 11),
print.summary = FALSE) +
geom_hline(yintercept = avg_sil_ms, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
labs(
title = "Silhouette Plot — Mean Shift Clustering",
subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_ms, 4))
) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
} else {
avg_sil_ms <- NA
cat("Hanya 1 kluster terbentuk.\n")
}

11.4 Visualisasi Mean Shift (PCA 2D)
pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))
warna_ms <- setNames(
c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
"#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
paste0("K", 1:10)
)
ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
geom_point(alpha = 0.4, size = 0.8) +
scale_color_manual(values = warna_ms) +
labs(
title = "Visualisasi Mean Shift Clustering (PCA 2D)",
subtitle = paste0("Bandwidth h = 1.5 | ",
n_kluster_ms, " kluster ditemukan otomatis"),
color = "Kluster"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.5 Visualisasi (PCA 2D)
pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))
warna_ms <- setNames(
c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
"#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
paste0("K", 1:10)
)
ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
geom_point(alpha = 0.4, size = 0.8) +
scale_color_manual(values = warna_ms) +
labs(
title = "Visualisasi Mean Shift Clustering (PCA 2D)",
subtitle = paste0("Bandwidth h = 1.5"),
color = "Kluster"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.6 Evaluasi Eksternal
purity_ms <- purity_score(y_label, ms_labels_full)
cat("Purity Score Mean Shift:", round(purity_ms, 4),
paste0("(", round(purity_ms * 100, 2), "%)"), "\n")
## Purity Score Mean Shift: 0.3939 (39.39%)
Tahap 12: Fuzzy C-Means Clustering
##12.1 Training Fuzzy C-means
## 12.1 Training Fuzzy C-Means
library(e1071)
# Definisikan ulang jika session di-restart
k_optimal <- 7
set.seed(42)
fcm_result <- cmeans(
as.matrix(X_final),
centers = k_optimal,
iter.max = 1000,
m = 2,
dist = "euclidean",
method = "cmeans"
)
fcm_labels <- fcm_result$cluster
fcm_membership <- fcm_result$membership
cat("Fuzzy C-Means selesai!\n\n")
## Fuzzy C-Means selesai!
cat("Fuzzifier (m) : 2\n")
## Fuzzifier (m) : 2
cat("Metrik jarak : Euclidean\n")
## Metrik jarak : Euclidean
cat("Jumlah kluster :", k_optimal, "\n")
## Jumlah kluster : 7
cat("Withiness (obj) :", round(fcm_result$withinerror, 4), "\n\n")
## Withiness (obj) : 0.7123
cat("Distribusi label hard:\n")
## Distribusi label hard:
print(table(fcm_labels))
## fcm_labels
## 1 2 3 4 5 6 7
## 2702 2747 1208 1489 1824 2307 1266
12.2 Visualisasi Matriks Keanggotaan Fuzzy (200 sampel)
library(tidyverse)
set.seed(42)
samp_fcm <- sample(nrow(fcm_membership), 200)
membership_long <- as.data.frame(fcm_membership[samp_fcm, ]) %>%
mutate(Observasi = row_number()) %>%
pivot_longer(-Observasi,
names_to = "Kluster",
values_to = "Keanggotaan") %>%
mutate(Kluster = paste0("K", Kluster))
ggplot(membership_long,
aes(x = Observasi, y = Keanggotaan, fill = Kluster)) +
geom_area(alpha = 0.8, position = "stack") +
scale_fill_manual(values = c("#2ecc71","#3498db","#e74c3c",
"#f39c12","#9b59b6","#1abc9c","#e67e22")) +
labs(
title = "Matriks Keanggotaan Fuzzy C-Means (200 sampel)",
subtitle = "Setiap observasi memiliki derajat keanggotaan di semua kluster",
x = "Observasi (sampel)", y = "Derajat Keanggotaan",
fill = "Kluster"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.3 Evaluasi Internal — Silhouette
set.seed(42)
samp_fcm2 <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_fcm <- silhouette(fcm_labels[samp_fcm2], dist(X_final[samp_fcm2, ]))
avg_sil_fcm <- mean(sil_fcm[, 3])
fviz_silhouette(sil_fcm,
palette = "jco",
ggtheme = theme_minimal(base_size = 11),
print.summary = FALSE) +
geom_hline(yintercept = avg_sil_fcm, linetype = "dashed",
color = "#e74c3c", linewidth = 0.8) +
labs(
title = "Silhouette Plot — Fuzzy C-Means Clustering",
subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_fcm, 4),
" | m = 2 (fuzzifier)")
) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.4 Visualisasi Cluster Plot (PCA 2D)
pca_2d_fcm <- as.data.frame(
prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2]
)
pca_2d_fcm$Kluster <- factor(paste0("K", fcm_labels))
ggplot(pca_2d_fcm, aes(x = PC1, y = PC2, color = Kluster)) +
geom_point(alpha = 0.4, size = 0.8) +
scale_color_manual(values = c("#2ecc71","#3498db","#e74c3c",
"#f39c12","#9b59b6","#1abc9c","#e67e22")) +
stat_ellipse(type = "convex", alpha = 0.12, geom = "polygon",
aes(fill = Kluster), show.legend = FALSE) +
scale_fill_manual(values = c("#2ecc71","#3498db","#e74c3c",
"#f39c12","#9b59b6","#1abc9c","#e67e22")) +
labs(
title = "Visualisasi Fuzzy C-Means Clustering (PCA 2D)",
subtitle = paste0("k = ", k_optimal, " | m = 2 | Label = hard assignment"),
color = "Kluster"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type

12.5 Heatmap Centroid Fuzzy C-Means
centroid_fcm <- as.data.frame(fcm_result$centers) %>%
mutate(Kluster = paste0("K", 1:k_optimal)) %>%
pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")
ggplot(centroid_fcm, aes(x = Fitur, y = Kluster, fill = Nilai)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#3498db",
midpoint = 0, name = "Z-score") +
labs(
title = "Profil Centroid Fuzzy C-Means (Z-score)",
subtitle = "Centroid fuzzy = rata-rata berbobot keanggotaan | Merah = rendah | Biru = tinggi",
x = "Fitur", y = "Kluster"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 13),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
axis.text.x = element_text(angle = 35, hjust = 1, face = "bold")
)

12.6 Evaluasi Eksternal — Purity Score
purity_score <- function(labels_true, labels_pred) {
tbl <- table(labels_true, labels_pred)
sum(apply(tbl, 2, max)) / length(labels_true)
}
purity_fcm <- purity_score(y_label, fcm_labels)
table(Kluster = fcm_labels, Kelas_Asli = y_label) %>%
as.data.frame.matrix() %>%
rownames_to_column("Kluster") %>%
mutate(Kluster = paste0("K", Kluster)) %>%
kable(caption = "Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli
|
Kluster
|
BARBUNYA
|
BOMBAY
|
CALI
|
DERMASON
|
HOROZ
|
SEKER
|
SIRA
|
|
K1
|
35
|
0
|
13
|
1505
|
16
|
77
|
1056
|
|
K2
|
41
|
0
|
19
|
1482
|
9
|
114
|
1082
|
|
K3
|
798
|
0
|
70
|
46
|
81
|
18
|
195
|
|
K4
|
203
|
147
|
701
|
48
|
278
|
5
|
107
|
|
K5
|
184
|
369
|
756
|
0
|
434
|
0
|
81
|
|
K6
|
3
|
4
|
2
|
451
|
0
|
1813
|
34
|
|
K7
|
58
|
2
|
69
|
14
|
1042
|
0
|
81
|
cat("\nPurity Score Fuzzy C-Means:", round(purity_fcm, 4),
paste0("(", round(purity_fcm * 100, 2), "%)"), "\n")
##
## Purity Score Fuzzy C-Means: 0.5979 (59.79%)
12.7 Kesimpulan Fuzzy C-Means
data.frame(
Aspek = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
"Fuzzifier (m)", "Jumlah Kluster",
"Silhouette Score", "Purity Score", "Status"),
Hasil = c("Fuzzy C-Means (cmeans)",
"Centroid fuzzy (rata-rata berbobot keanggotaan)",
"Euclidean",
"2 (standar)",
paste0(k_optimal, " kluster"),
round(avg_sil_fcm, 4),
round(purity_fcm, 4),
"Selesai — lanjut ke Tahap 13 Perbandingan")
) %>%
kable(caption = "Kesimpulan Tahap 12: Fuzzy C-Means Clustering") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
row_spec(8, background = "#eafaf1", color = "#1e8449")
Kesimpulan Tahap 12: Fuzzy C-Means Clustering
|
Aspek
|
Hasil
|
|
Algoritma
|
Fuzzy C-Means (cmeans)
|
|
Pusat Kluster
|
Centroid fuzzy (rata-rata berbobot keanggotaan)
|
|
Metrik Jarak
|
Euclidean
|
|
Fuzzifier (m)
|
2 (standar)
|
|
Jumlah Kluster
|
7 kluster
|
|
Silhouette Score
|
0.253
|
|
Purity Score
|
0.5979
|
|
Status
|
Selesai — lanjut ke Tahap 13 Perbandingan
|