Bagian ini bertujuan untuk memahami struktur data awal dan sebaran fitur sebelum dilakukan pemrosesan.
Memuat semua library yang diperlukan selama proses berlangsung dan file dataset.csv. Karena file aslinya tidak memiliki header, pemberian nama kolom akan dilakukan secara manual sesuai dengan atribut Breast Cancer Wisconsin.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.5.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(factoextra)
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
# Read the raw data; the file has no header row
df <- read.csv("dataset.csv", header = FALSE)

# Column names following the UCI repository: ten base measurements, each
# repeated with the suffixes _mean, _se and _worst (in that order)
features <- c(
  "radius", "texture", "perimeter", "area", "smoothness",
  "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension"
)
column_names <- c(
  "id", "diagnosis",
  paste0(
    rep(features, times = 3),
    rep(c("_mean", "_se", "_worst"), each = length(features))
  )
)
names(df) <- column_names

# Data cleaning: convert the target to a factor and drop the identifier
df <- df %>%
  mutate(diagnosis = as.factor(diagnosis)) %>%
  select(-id)

# Summary restricted to the first six columns
summary(df[, 1:6])
## diagnosis radius_mean texture_mean perimeter_mean area_mean
## B:357 Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## M:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
## Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
## Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean
## Min. :0.05263
## 1st Qu.:0.08637
## Median :0.09587
## Mean :0.09636
## 3rd Qu.:0.10530
## Max. :0.16340
Menunjukkan perbandingan jumlah tumor yang bersifat Ganas (Malignant) dan Jinak (Benign).
# Bar chart of the number of observations in each diagnosis class
ggplot(data = df, mapping = aes(x = diagnosis, fill = diagnosis)) +
  geom_bar() +
  ggtitle("Distribusi Diagnosis (B = Benign, M = Malignant)") +
  xlab("Diagnosis") +
  ylab("Jumlah") +
  theme_minimal()
Melihat perbedaan nilai radius_mean dan texture_mean terhadap jenis diagnosis.
# Boxplot of radius_mean split by diagnosis
p1 <- ggplot(
  data = df,
  mapping = aes(x = diagnosis, y = radius_mean, fill = diagnosis)
) +
  geom_boxplot() +
  ggtitle("Boxplot Radius Mean vs Diagnosis") +
  theme_minimal()

# Boxplot of texture_mean split by diagnosis
p2 <- ggplot(
  data = df,
  mapping = aes(x = diagnosis, y = texture_mean, fill = diagnosis)
) +
  geom_boxplot() +
  ggtitle("Boxplot Texture Mean vs Diagnosis") +
  theme_minimal()

# Draw both plots next to each other
grid.arrange(grobs = list(p1, p2), ncol = 2)
Memahami hubungan antar fitur dengan matriks korelasi.
# Correlation matrix of columns 2 to 11 of df (the ten "_mean" features)
cor_matrix <- cor(df[2:11])

# Upper-triangle colour map with the coefficients printed in each cell
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.srt = 45,
  title = "\nMatriks Korelasi Fitur Mean",
  mar = c(0, 0, 1, 0)
)
Melihat tumpang tindih (overlap) distribusi data antar diagnosis.
# Overlaid, semi-transparent density curves of area_mean per diagnosis
ggplot(data = df, mapping = aes(x = area_mean, fill = diagnosis)) +
  geom_density(alpha = 0.5) +
  ggtitle("Density Plot Area Mean berdasarkan Diagnosis") +
  theme_minimal()
Karena fitur-fitur dalam dataset ini memiliki rentang nilai yang sangat berbeda, maka perlu dilakukan scaling agar semua fitur memiliki bobot yang sama saat proses clustering.
# Keep only the feature columns (everything except the diagnosis label)
df_numeric <- select(df, -diagnosis)

# Centre and scale every feature with scale(), then return to a data frame
df_scaled <- df_numeric %>%
  scale() %>%
  as.data.frame()

# Quick check of the first five scaled columns
summary(df_scaled[, 1:5])
## radius_mean texture_mean perimeter_mean area_mean
## Min. :-2.0279 Min. :-2.2273 Min. :-1.9828 Min. :-1.4532
## 1st Qu.:-0.6888 1st Qu.:-0.7253 1st Qu.:-0.6913 1st Qu.:-0.6666
## Median :-0.2149 Median :-0.1045 Median :-0.2358 Median :-0.2949
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4690 3rd Qu.: 0.5837 3rd Qu.: 0.4992 3rd Qu.: 0.3632
## Max. : 3.9678 Max. : 4.6478 Max. : 3.9726 Max. : 5.2459
## smoothness_mean
## Min. :-3.10935
## 1st Qu.:-0.71034
## Median :-0.03486
## Mean : 0.00000
## 3rd Qu.: 0.63564
## Max. : 4.76672
PCA digunakan untuk menyederhanakan 30 fitur menjadi 2 komponen utama guna keperluan visualisasi spasial.
# Principal component analysis on the scaled features
pca_res <- prcomp(x = df_scaled, center = TRUE, scale. = TRUE)

# Scores on the first two principal components, as a data frame
df_pca <- as.data.frame(pca_res$x)[, 1:2]
library(factoextra)
Gunakan data yang sudah di-scale (df_scaled).
# Elbow method: total within-cluster sum of squares for a range of k.
# kmeans() uses random starting centres, so fix the RNG seed first; without
# it the curve changes slightly on every run.
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss") +
  # Dashed red marker at the chosen number of clusters (k = 2)
  geom_vline(xintercept = 2, linetype = 2, color = "red") +
  labs(title = "Elbow Method", subtitle = "Mencari Titik Siku")
Menggunakan k=2 karena hasil diagnosis kanker hanya Ganas dan Jinak.
# K-Means with two clusters and 25 random starts; seed fixed for repeatability
set.seed(123)
final_kmeans <- kmeans(x = df_scaled, centers = 2, nstart = 25)

# Cluster plot with a convex hull around each cluster
fviz_cluster(
  object = final_kmeans,
  data = df_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = c("#2E9FDF", "#E7B800"),
  ggtheme = theme_minimal()
)

# Store the assignment next to the true label and cross-tabulate both
df[["cluster"]] <- as.factor(final_kmeans[["cluster"]])
table(Diagnosis_Asli = df[["diagnosis"]], Cluster_Mesin = df[["cluster"]])
## Cluster_Mesin
## Diagnosis_Asli 1 2
## B 14 343
## M 175 37
Cluster 2 (Kelompok Tumor Jinak/Benign): Berhasil mengidentifikasi 343 pasien yang aslinya Benign.
Cluster 1 (Kelompok Tumor Ganas/Malignant): Berhasil mengelompokkan 175 pasien yang aslinya Malignant.
Akurasinya: \[\text{Akurasi} = \frac{\text{Data yang Benar}}{\text{Total Data}} = \frac{343 + 175}{569} \approx 91\%\]
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(tidyverse)
library(factoextra)
Menentukan jumlah cluster dengan menggunakan fungsi dari flexclust.
# Run K-Medians for k = 2..10, five repetitions per k; seed fixed first
set.seed(123)
kmed_steps <- stepFlexclust(
  df_scaled,
  k = 2:10,
  nrep = 5,
  family = kccaFamily("kmedians")
)
## 2 : * * * * *
## 3 : * * * * *
## 4 : * * * * *
## 5 : * * * * *
## 6 : * * * * *
## 7 : * * * * *
## 8 : * * * * *
## 9 : * * * * *
## 10 : * * * * *
# Plot the stepFlexclust results across the candidate values of k
plot(kmed_steps)

# Fit the final K-Medians model with k = 2; seed fixed for repeatability
set.seed(123)
final_kmedian <- kcca(df_scaled, k = 2, family = kccaFamily("kmedians"))

# Cluster assignment of every training row; computed once and reused below
kmed_clusters <- predict(final_kmedian)

fviz_cluster(list(data = df_scaled, cluster = kmed_clusters),
  geom = "point",
  ellipse.type = "convex",
  palette = c("#2E9FDF", "#E7B800"),
  ggtheme = theme_light()
) +
  labs(
    title = "Cluster Plot K-Median",
    x = "Dimensi 1 (Skala Jarak)",
    y = "Dimensi 2 (Skala Jarak)"
  ) +
  # axis.text and axis.title must both be arguments of the same theme() call,
  # otherwise the axis-title size is never applied to the plot
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )

# Store the assignment next to the true label and cross-tabulate both
df$cluster_kmedian <- as.factor(kmed_clusters)
table(Diagnosis_Asli = df$diagnosis, Cluster_KMedian = df$cluster_kmedian)
## Cluster_KMedian
## Diagnosis_Asli 1 2
## B 12 345
## M 184 28
Berdasarkan hasil pengelompokan menggunakan algoritma K-Median (dengan jarak Manhattan) pada dataset Breast Cancer Wisconsin, dapat ditarik beberapa poin utama:
Cluster 2 (Kelompok Tumor Jinak/Benign): Berhasil mengelompokkan 345 pasien yang aslinya Benign.
Cluster 1 (Kelompok Tumor Ganas/Malignant): Berhasil mengelompokkan 184 pasien yang aslinya Malignant.
Akurasinya: \[\text{Akurasi} = \frac{\text{Data yang Benar}}{\text{Total Data}} = \frac{345 + 184}{569} \approx 93\%\]
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(factoextra)
library(tidyverse)
DBSCAN membutuhkan dua parameter, yaitu MinPts (minimal titik) dan Eps (jarak jangkauan). Cara mencari Eps terbaik adalah dengan K-Nearest Neighbor (KNN) Distance Plot.
# k-nearest-neighbour distance plot (k = 5) used to pick a value for eps
dbscan::kNNdistplot(df_scaled, k = 5)
# Reference line at the candidate eps. Base graphics takes the line type via
# `lty`; `linetype` is a ggplot2 argument and is ignored with a warning here.
abline(h = 1.5, col = "red", lty = "dashed")
# DBSCAN with eps = 1.5 and minPts = 5 on the scaled features
set.seed(123)
final_dbscan <- dbscan(x = df_scaled, eps = 1.5, minPts = 5)

# Show the cluster sizes and the number of noise points
print(final_dbscan)
## DBSCAN clustering for 569 objects.
## Parameters: eps = 1.5, minPts = 5
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 550 noise points.
##
## 0 1
## 550 19
##
## Available fields: cluster, eps, minPts, metric, borderPoints
# Cluster plot of the DBSCAN result
fviz_cluster(
  object = final_dbscan,
  data = df_scaled,
  geom = "point",
  main = "Visualisasi Cluster DBSCAN",
  palette = "jco",
  ggtheme = theme_minimal()
)

# Store the assignment next to the true label and cross-tabulate both
df[["cluster_dbscan"]] <- as.factor(final_dbscan[["cluster"]])
table(
  Diagnosis_Asli = df[["diagnosis"]],
  Cluster_DBSCAN = df[["cluster_dbscan"]]
)
## Cluster_DBSCAN
## Diagnosis_Asli 0 1
## B 338 19
## M 212 0
Berdasarkan hasil pengelompokan menggunakan algoritma DBSCAN pada dataset Breast Cancer Wisconsin, dapat ditarik beberapa poin utama:
Cluster 1 (Kelompok Homogen): Berisi 19 pasien yang semuanya memiliki diagnosis Benign (Jinak).
Cluster 0 (Noise/Outlier): Sebanyak 550 data (338 Benign dan 212 Malignant) dianggap sebagai noise oleh algoritma.
Akurasi: \[\text{Akurasi} = \frac{19}{569} \times 100\% \approx \mathbf{3,3\%}\]
library(meanShiftR)
library(factoextra)
library(tidyverse)
Pada algoritma Mean Shift, parameter utama yang digunakan adalah bandwidth, yaitu radius untuk menentukan kedekatan antar data dalam proses pembentukan cluster.
Pada percobaan awal dengan bandwidth 1, dihasilkan jumlah cluster yang sangat banyak (overclustering), karena bandwidth terlalu kecil sehingga setiap titik data membentuk cluster sendiri. Oleh karena itu, dilakukan penyesuaian bandwidth menjadi:
# One bandwidth value (4.3) per feature column of df_scaled
bandwidth <- rep_len(4.3, length.out = ncol(df_scaled))
Nilai bandwidth yang lebih besar membuat area pencarian lebih luas, mengurangi jumlah cluster yang terbentuk, disesuaikan dengan tujuan analisis yaitu mendekati jumlah kelas asli (2 kelas: Benign & Malignant), serta hasil eksperimen menunjukkan bahwa bandwidth 4.3 menghasilkan 2 cluster, sehingga lebih representatif.
# Mean Shift on the scaled feature matrix with the per-column bandwidth
set.seed(123)
mean_shift_result <- meanShift(
  queryData = as.matrix(df_scaled),
  bandwidth = bandwidth
)
clusters_ms <- mean_shift_result[["assignment"]]

# Cluster plot with a convex hull around each cluster
fviz_cluster(
  list(data = df_scaled, cluster = clusters_ms),
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal()
) +
  ggtitle("Visualisasi Cluster Mean Shift")

# Store the assignment next to the true label and cross-tabulate both
df[["cluster_meanshift"]] <- as.factor(clusters_ms)
table(
  Diagnosis_Asli = df[["diagnosis"]],
  Cluster_MeanShift = df[["cluster_meanshift"]]
)
## Cluster_MeanShift
## Diagnosis_Asli 1 2
## B 357 0
## M 211 1
Berdasarkan hasil pengelompokan menggunakan algoritma Mean Shift pada dataset Breast Cancer Wisconsin, diperoleh hasil sebagai berikut:
Cluster 1 (Dominan Benign): Berhasil mengelompokkan 357 pasien Benign, namun juga mencampurkan 211 pasien Malignant ke dalam cluster ini.
Cluster 2 (Sangat kecil): Hanya berisi 1 pasien Malignant.
Akurasi \[\text{Akurasi} = \frac{358}{569} \times 100\% \approx \mathbf{62,9\%}\]
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
##
## bclust
## The following object is masked from 'package:ggplot2':
##
## element
library(factoextra)
library(tidyverse)
Fuzzy C-Means membutuhkan parameter centers (jumlah cluster). Karena tujuan kita memisahkan Benign dan Malignant, kita set centers = 2.
# Number of clusters: one per diagnosis class. Kept in a descriptive name
# (not `c`, which would shadow base::c) and actually passed to cmeans().
n_centers <- 2

# Fuzzy C-Means with fuzzifier m = 2; seed fixed for repeatability
set.seed(123)
fcm_result <- cmeans(df_scaled, centers = n_centers, m = 2, iter.max = 100)

# Hard assignment: the cluster with the highest membership for each row
clusters_fcm <- apply(fcm_result$membership, 1, which.max)

fviz_cluster(list(data = df_scaled, cluster = clusters_fcm),
  geom = "point",
  ellipse.type = "convex",
  palette = c("#2E9FDF", "#E7B800"),
  ggtheme = theme_minimal()
) +
  labs(title = "Visualisasi Cluster Fuzzy C-Means")

# Store the assignment next to the true label and cross-tabulate both
df$cluster_fcm <- as.factor(clusters_fcm)
table(Diagnosis_Asli = df$diagnosis, Cluster_FCM = df$cluster_fcm)
## Cluster_FCM
## Diagnosis_Asli 1 2
## B 18 339
## M 181 31
Berdasarkan hasil pengelompokan menggunakan algoritma Fuzzy C-Means pada dataset Breast Cancer Wisconsin, diperoleh hasil sebagai berikut:
Cluster 1 (18 B + 181 M): didominasi Malignant, tetapi sebagian kecil data Benign (18 pasien) salah masuk.
Cluster 2 (339 B + 31 M): didominasi Benign, sebagian kecil Malignant salah masuk.
FCM membagi data sesuai kemiripan fitur, namun karena banyak overlap antara Benign dan Malignant, beberapa data “tersesat” ke cluster lain.
Perbandingan akurasi seluruh algoritma untuk menentukan metode paling optimal.
# Correctly grouped counts per algorithm, copied from the confusion tables
algoritma <- c("K-Medians", "Fuzzy C-Means", "K-Means", "Mean Shift", "DBSCAN")
benign_benar <- c(345, 339, 343, 357, 19)
malignant_benar <- c(184, 181, 175, 1, 0)

# Assemble the table first, then derive accuracy (%) over all 569 observations
tabel_evaluasi <- data.frame(
  Algoritma = algoritma,
  Benign_Benar = benign_benar,
  Malignant_Benar = malignant_benar
)
tabel_evaluasi$Akurasi <- with(
  tabel_evaluasi,
  round(((Benign_Benar + Malignant_Benar) / 569) * 100, 1)
)
# Render the comparison table with knitr::kable, including a caption
knitr::kable(tabel_evaluasi, caption = "Perbandingan Akurasi Algoritma")
| Algoritma | Benign_Benar | Malignant_Benar | Akurasi |
|---|---|---|---|
| K-Medians | 345 | 184 | 93.0 |
| Fuzzy C-Means | 339 | 181 | 91.4 |
| K-Means | 343 | 175 | 91.0 |
| Mean Shift | 357 | 1 | 62.9 |
| DBSCAN | 19 | 0 | 3.3 |