library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
library(dbscan)
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
library(fpc)
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(readxl)
Import Dataset
df <- read_excel("Dry_Bean_Dataset.xlsx")
head(df)
## # A tibble: 6 × 17
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28395 610. 208. 174. 1.20 0.550
## 2 28734 638. 201. 183. 1.10 0.412
## 3 29380 624. 213. 176. 1.21 0.563
## 4 30008 646. 211. 183. 1.15 0.499
## 5 30140 620. 202. 190. 1.06 0.334
## 6 30279 635. 213. 182. 1.17 0.520
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>Deskripsi Data
describe(df)
## vars n mean sd median trimmed mad
## Area 1 13611 53048.28 29324.10 44652.00 48011.59 15398.28
## Perimeter 2 13611 855.28 214.29 794.94 828.15 174.76
## MajorAxisLength 3 13611 320.14 85.69 296.88 310.31 81.50
## MinorAxisLength 4 13611 202.27 44.97 192.43 195.85 28.26
## AspectRation 5 13611 1.58 0.25 1.55 1.57 0.20
## Eccentricity 6 13611 0.75 0.09 0.76 0.76 0.07
## ConvexArea 7 13611 53768.20 29774.92 45178.00 48652.37 15665.15
## EquivDiameter 8 13611 253.06 59.18 238.44 245.18 42.02
## Extent 9 13611 0.75 0.05 0.76 0.75 0.05
## Solidity 10 13611 0.99 0.00 0.99 0.99 0.00
## roundness 11 13611 0.87 0.06 0.88 0.88 0.06
## Compactness 12 13611 0.80 0.06 0.80 0.80 0.05
## ShapeFactor1 13 13611 0.01 0.00 0.01 0.01 0.00
## ShapeFactor2 14 13611 0.00 0.00 0.00 0.00 0.00
## ShapeFactor3 15 13611 0.64 0.10 0.64 0.64 0.08
## ShapeFactor4 16 13611 1.00 0.00 1.00 1.00 0.00
## Class* 17 13611 4.53 1.83 4.00 4.66 1.48
## min max range skew kurtosis se
## Area 20420.00 254616.00 234196.00 2.95 10.79 251.35
## Perimeter 524.74 1985.37 1460.63 1.63 3.59 1.84
## MajorAxisLength 183.60 738.86 555.26 1.36 2.53 0.73
## MinorAxisLength 122.51 460.20 337.69 2.24 6.65 0.39
## AspectRation 1.02 2.43 1.41 0.58 0.11 0.00
## Eccentricity 0.22 0.91 0.69 -1.06 1.39 0.00
## ConvexArea 20684.00 263261.00 242577.00 2.94 10.74 255.21
## EquivDiameter 161.24 569.37 408.13 1.95 5.19 0.51
## Extent 0.56 0.87 0.31 -0.90 0.64 0.00
## Solidity 0.92 0.99 0.08 -2.55 12.79 0.00
## roundness 0.49 0.99 0.50 -0.64 0.37 0.00
## Compactness 0.64 0.99 0.35 0.04 -0.22 0.00
## ShapeFactor1 0.00 0.01 0.01 -0.53 0.71 0.00
## ShapeFactor2 0.00 0.00 0.00 0.30 -0.86 0.00
## ShapeFactor3 0.41 0.97 0.56 0.24 -0.15 0.00
## ShapeFactor4 0.95 1.00 0.05 -2.76 13.03 0.00
## Class* 1.00 7.00 6.00 -0.32 -0.74 0.02Seleksi Variabel
df <- df %>%
select(-matches("class", ignore.case = TRUE))Cek Missing Value
colSums(is.na(df))
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 0 0 0 0 0
## Eccentricity ConvexArea EquivDiameter Extent Solidity
## 0 0 0 0 0
## roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
## 0 0 0 0 0
## ShapeFactor4
## 0Tabel Missing Value
missing_table <- df %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "missing_values")
missing_table
## # A tibble: 16 × 2
## variable missing_values
## <chr> <int>
## 1 Area 0
## 2 Perimeter 0
## 3 MajorAxisLength 0
## 4 MinorAxisLength 0
## 5 AspectRation 0
## 6 Eccentricity 0
## 7 ConvexArea 0
## 8 EquivDiameter 0
## 9 Extent 0
## 10 Solidity 0
## 11 roundness 0
## 12 Compactness 0
## 13 ShapeFactor1 0
## 14 ShapeFactor2 0
## 15 ShapeFactor3 0
## 16 ShapeFactor4 0
Handling Missing Value
df <- df %>%
mutate(across(everything(),
~ifelse(is.na(.), median(., na.rm = TRUE), .)))Normalisasi Data
df_scaled <- scale(df)
summary(df_scaled)
## Area Perimeter MajorAxisLength MinorAxisLength
## Min. :-1.1127 Min. :-1.5425 Min. :-1.5933 Min. :-1.7736
## 1st Qu.:-0.5702 1st Qu.:-0.7082 1st Qu.:-0.7800 1st Qu.:-0.5876
## Median :-0.2863 Median :-0.2816 Median :-0.2714 Median :-0.2188
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2825 3rd Qu.: 0.5690 3rd Qu.: 0.6576 3rd Qu.: 0.3282
## Max. : 6.8738 Max. : 5.2736 Max. : 4.8862 Max. : 5.7355
## AspectRation Eccentricity ConvexArea EquivDiameter
## Min. :-2.2636 Min. :-5.7819 Min. :-1.1111 Min. :-1.5516
## 1st Qu.:-0.6119 1st Qu.:-0.3801 1st Qu.:-0.5728 1st Qu.:-0.6421
## Median :-0.1302 Median : 0.1472 Median :-0.2885 Median :-0.2472
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5021 3rd Qu.: 0.6475 3rd Qu.: 0.2863 3rd Qu.: 0.4458
## Max. : 3.4339 Max. : 1.7448 Max. : 7.0359 Max. : 5.3451
## Extent Solidity roundness Compactness
## Min. :-3.9607 Min. :-14.5689 Min. :-6.4460 Min. :-2.5811
## 1st Qu.:-0.6336 1st Qu.: -0.3159 1st Qu.:-0.6920 1st Qu.:-0.6059
## Median : 0.2063 Median : 0.2446 Median : 0.1659 Median : 0.0229
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7562 3rd Qu.: 0.6159 3rd Qu.: 0.7323 3rd Qu.: 0.5575
## Max. : 2.3726 Max. : 1.6167 Max. : 1.9725 Max. : 3.0373
## ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## Min. :-3.35603 Min. :-1.93292 Min. :-2.35617 Min. :-10.8500
## 1st Qu.:-0.58838 1st Qu.:-0.94387 1st Qu.:-0.62863 1st Qu.: -0.3116
## Median : 0.07231 Median :-0.03762 Median :-0.01562 Median : 0.3029
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.62749 3rd Qu.: 0.76244 3rd Qu.: 0.52948 3rd Qu.: 0.6457
## Max. : 3.44642 Max. : 3.27086 Max. : 3.34535 Max. : 1.0693Sampling Data
set.seed(123)
df_sample <- df_scaled[sample(1:nrow(df_scaled), 500), ]
cat("Jumlah data:", nrow(df_sample), "\n")
## Jumlah data: 500
cat("Jumlah variabel:", ncol(df_sample))
## Jumlah variabel: 16Elbow Method
wss <- sapply(1:10, function(k){
kmeans(df_sample, centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type="b", pch=19,
xlab="Jumlah Cluster",
ylab="WSS",
main="Elbow Method")
Silhouette Method
avg_sil <- function(k){
km <- kmeans(df_sample, centers = k, nstart = 25)
ss <- silhouette(km$cluster, dist(df_sample))
mean(ss[,3])
}
k_values <- 2:10
sil_values <- sapply(k_values, avg_sil)
## Warning: did not converge in 10 iterations
plot(k_values, sil_values, type="b", pch=19,
xlab="Jumlah Cluster",
ylab="Silhouette",
main="Silhouette Method")
K Optimal
k <- k_values[which.max(sil_values)]
k
## [1] 2K-Means
km_res <- kmeans(df_sample, centers = k, nstart = 25)
km_res$centers # pusat cluster
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1 -0.4950036 -0.5888755 -0.6133563 -0.4398724 -0.4479450
## 2 0.7557036 0.9223304 0.9618134 0.6498142 0.7542531
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## 1 -0.3346927 -0.4963798 -0.5533037 0.1825227 0.2348752 0.5035383
## 2 0.6994287 0.7595342 0.8529195 -0.1871576 -0.5860549 -0.9101130
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1 0.4201909 0.4683987 0.5561958 0.4058195 0.3671445
## 2 -0.7629544 -0.6861022 -0.9764030 -0.7567268 -0.6758116
km_res$cluster # hasil cluster tiap data
## [1] 2 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 2 1 1 1 1
## [38] 2 1 1 2 2 1 2 1 2 1 1 1 1 1 1 2 1 1 2 1 2 2 2 1 1 1 1 1 2 2 2 1 1 2 2 1 1
## [75] 1 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 2 2 1 1 2 1 1 1
## [112] 1 1 1 1 2 2 1 2 1 1 1 2 2 2 1 2 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2
## [149] 2 1 2 1 2 1 1 1 2 1 1 1 1 2 2 2 1 2 1 1 2 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1
## [186] 1 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1
## [223] 2 1 1 2 2 1 1 1 2 1 2 2 1 2 2 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
## [260] 2 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 1 1 1
## [297] 1 2 1 1 1 2 1 1 1 1 2 1 1 2 2 2 2 2 1 2 1 1 1 2 1 1 2 2 1 2 2 1 1 1 2 1 2
## [334] 1 2 1 1 1 2 2 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2
## [371] 2 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1 2 2 1 1 2 1 1 2 2 2 2
## [408] 1 1 2 2 2 1 2 2 1 1 1 1 2 1 1 1 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 2 2 1 1 2 1
## [445] 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 2 1 2 1 2 1 2 1 1 2 1
## [482] 1 1 2 1 1 1 2 2 1 1 2 1 1 1 1 1 2 2 1
km_res$tot.withinss # total variasi dalam cluster
## [1] 4623.542K-Median
kmed_res <- kcca(df_sample, k = k, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
clusters(kmed_res) # hasil cluster
## [1] 2 2 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 2 2 1 1 1
## [38] 2 1 1 2 2 1 2 1 2 1 1 1 1 1 1 2 1 1 2 1 2 2 2 1 1 1 1 1 2 2 2 1 1 2 2 1 1
## [75] 1 1 1 1 2 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 2 2 1 1 2 1 1 1
## [112] 1 1 1 1 2 2 1 2 1 1 1 2 2 2 1 2 1 2 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2
## [149] 2 1 2 1 2 1 2 1 2 1 1 1 1 2 2 2 1 2 1 1 2 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1
## [186] 1 2 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1
## [223] 2 1 1 2 2 1 1 1 2 1 2 2 1 2 2 2 1 1 2 2 2 2 2 1 1 1 1 1 1 1 2 2 2 2 2 2 2
## [260] 2 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 1 1 1
## [297] 1 2 1 2 1 2 1 1 1 1 2 1 1 2 2 2 2 2 1 2 1 1 1 2 1 1 2 2 1 2 2 1 1 1 2 1 2
## [334] 1 2 1 1 1 2 2 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2
## [371] 2 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1 2 2 1 1 2 1 1 2 2 2 2
## [408] 1 1 2 2 2 1 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 1 2 2 2 1 1 2 2 2 1 2 2 1 1 2 1
## [445] 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 2 1 2 1 2 1 2 1 1 2 1
## [482] 1 1 2 1 1 1 2 2 1 1 2 1 1 1 1 1 2 2 1
parameters(kmed_res) # pusat cluster
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## [1,] -0.4993942 -0.6061815 -0.6580358 -0.3956746 -0.3932004
## [2,] 0.4267724 0.6406680 0.7529436 0.5218694 0.5780800
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## [1,] -0.1206780 -0.5001257 -0.5396756 0.31241754 0.3566702 0.5535950
## [2,] 0.6970405 0.4333782 0.6059833 -0.01362707 -0.2264801 -0.8240637
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## [1,] 0.3113895 0.3268573 0.4735203 0.2757972 0.4879563
## [2,] -0.6801757 -0.7722140 -1.0016005 -0.6989811 -0.4369522
Jumlah anggota tiap cluster
table(km_res$cluster)
##
## 1 2
## 301 199kNN Distance Plot
kNNdistplot(df_sample, k = 5)
abline(h = 1.5, col = "red", lwd = 2)
title("kNN Distance Plot")
DBSCAN
db_res <- dbscan::dbscan(df_sample, eps = 0.8, minPts = 5)
table(db_res$cluster)
##
## 0 1 2 3 4 5 6 7
## 292 143 34 6 6 8 3 8PCA + Mean Shift
df_pca <- prcomp(df_sample)$x[,1:2]
ms_res <- meanShiftR::meanShift(
as.matrix(df_pca),
bandwidth = c(1.5, 1.5)
)
ms_cluster <- ms_res$assignment
table(ms_cluster)
## ms_cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 5 159 27 8 55 2 7 5 1 6 3 4 4 7 5 40 4 10 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 4 3 4 4 3 3 2 3 3 5 1 1 1 7 4 3 1 1 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 1 1 2 4 3 3 1 1 7 2 4 1 5 1 3 1 2 2 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 4 2 2 3 1 1 2 2 2 1 1 1 1 3 1 2 2 1 1 1
## 81 82 83 84 85
## 1 1 1 1 1Fuzzy C-Means
fcm_res <- cmeans(df_sample, centers = k, m = 2)
head(fcm_res$membership)
## 1 2
## [1,] 0.2365864 0.7634136
## [2,] 0.5007008 0.4992992
## [3,] 0.8354073 0.1645927
## [4,] 0.6152742 0.3847258
## [5,] 0.8145180 0.1854820
## [6,] 0.2253815 0.7746185Distance
dist_sample <- dist(df_sample)
dist_pca <- dist(df_pca)Silhouette
sil_km <- mean(silhouette(km_res$cluster, dist_sample)[,3])
sil_kmed <- mean(silhouette(clusters(kmed_res), dist_sample)[,3])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_sample)[,3])Silhouette DBSCAN
db_cluster <- db_res$cluster
valid <- db_cluster != 0
if(length(unique(db_cluster[valid])) > 1){
d <- as.matrix(dist(df_sample))
sil_db <- mean(silhouette(db_cluster[valid], d[valid, valid])[,3])
} else {
sil_db <- NA
}Silhouette Mean Shift
if(length(unique(ms_cluster)) > 1){
sil_ms <- mean(silhouette(ms_cluster, dist_pca)[,3])
} else {
sil_ms <- NA
}Dunn Index
dunn_km <- cluster.stats(dist_sample, km_res$cluster)$dunn
dunn_kmed <- cluster.stats(dist_sample, clusters(kmed_res))$dunn
dunn_fcm <- cluster.stats(dist_sample, fcm_res$cluster)$dunnDunn DBSCAN
if(length(unique(db_cluster[valid])) > 1){
dunn_db <- cluster.stats(dist(df_sample[valid,]), db_cluster[valid])$dunn
} else {
dunn_db <- NA
}Dunn Mean Shift
if(length(unique(ms_cluster)) > 1){
dunn_ms <- cluster.stats(dist_pca, ms_cluster)$dunn
} else {
dunn_ms <- NA
}Tabel Evaluasi
evaluation_table <- data.frame(
Method = c("K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm),
Dunn_Index = c(dunn_km, dunn_kmed, dunn_db, dunn_ms, dunn_fcm)
)
evaluation_table
## Method Silhouette Dunn_Index
## 1 K-Means 0.37757267 0.041532335
## 2 K-Median 0.36990395 0.030220443
## 3 DBSCAN 0.11091725 0.130895545
## 4 Mean Shift -0.09059167 0.006812513
## 5 Fuzzy C-Means 0.37288334 0.034605573Output Evaluasi
cat("SILHOUETTE\n")
## SILHOUETTE
cat("K-Means :", sil_km, "\n")
## K-Means : 0.3775727
cat("K-Median :", sil_kmed, "\n")
## K-Median : 0.369904
cat("DBSCAN :", sil_db, "\n")
## DBSCAN : 0.1109172
cat("Mean Shift :", sil_ms, "\n")
## Mean Shift : -0.09059167
cat("Fuzzy C-Means :", sil_fcm, "\n")
## Fuzzy C-Means : 0.3728833
cat("\nDUNN INDEX\n")
##
## DUNN INDEX
cat("K-Means :", dunn_km, "\n")
## K-Means : 0.04153233
cat("K-Median :", dunn_kmed, "\n")
## K-Median : 0.03022044
cat("DBSCAN :", dunn_db, "\n")
## DBSCAN : 0.1308955
cat("Mean Shift :", dunn_ms, "\n")
## Mean Shift : 0.006812513
cat("Fuzzy C-Means :", dunn_fcm, "\n")
## Fuzzy C-Means : 0.03460557Interpretasi K-Means
df_result <- as.data.frame(df_sample)
df_result$Cluster_KMeans <- km_res$cluster
aggregate(df_result,
by=list(df_result$Cluster_KMeans),
mean)
## Group.1 Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1 1 -0.4950036 -0.5888755 -0.6133563 -0.4398724 -0.4479450
## 2 2 0.7557036 0.9223304 0.9618134 0.6498142 0.7542531
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## 1 -0.3346927 -0.4963798 -0.5533037 0.1825227 0.2348752 0.5035383
## 2 0.6994287 0.7595342 0.8529195 -0.1871576 -0.5860549 -0.9101130
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1 0.4201909 0.4683987 0.5561958 0.4058195 0.3671445
## 2 -0.7629544 -0.6861022 -0.9764030 -0.7567268 -0.6758116
## Cluster_KMeans
## 1 1
## 2 2