Load Library
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(dplyr)
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:mclust':
##
## sim
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
Pre Processing
df_raw <- read_excel("D:/.tugas semester 4 dika/analisis multivariat/Dry_Bean_Dataset.xlsx")
head(df_raw,5)
## # A tibble: 5 × 17
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28395 610. 208. 174. 1.20 0.550
## 2 28734 638. 201. 183. 1.10 0.412
## 3 29380 624. 213. 176. 1.21 0.563
## 4 30008 646. 211. 183. 1.15 0.499
## 5 30140 620. 202. 190. 1.06 0.334
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>
# melihat misiing value
colSums(is.na(df_raw))
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 0 0 0 0 0
## Eccentricity ConvexArea EquivDiameter Extent Solidity
## 0 0 0 0 0
## roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
## 0 0 0 0 0
## ShapeFactor4 Class
## 0 0
true_labels <- as.numeric(as.factor(df_raw$Class))
# hapus class
df <- df_raw[,-17]
Statistika Deskriptif
df_describe <- describe(df)
df_describe <- select(df_describe,mean,median,sd,min,max,skew,kurtosis)
df_describe
## mean median sd min max skew kurtosis
## Area 53048.28 44652.00 29324.10 20420.00 254616.00 2.95 10.79
## Perimeter 855.28 794.94 214.29 524.74 1985.37 1.63 3.59
## MajorAxisLength 320.14 296.88 85.69 183.60 738.86 1.36 2.53
## MinorAxisLength 202.27 192.43 44.97 122.51 460.20 2.24 6.65
## AspectRation 1.58 1.55 0.25 1.02 2.43 0.58 0.11
## Eccentricity 0.75 0.76 0.09 0.22 0.91 -1.06 1.39
## ConvexArea 53768.20 45178.00 29774.92 20684.00 263261.00 2.94 10.74
## EquivDiameter 253.06 238.44 59.18 161.24 569.37 1.95 5.19
## Extent 0.75 0.76 0.05 0.56 0.87 -0.90 0.64
## Solidity 0.99 0.99 0.00 0.92 0.99 -2.55 12.79
## roundness 0.87 0.88 0.06 0.49 0.99 -0.64 0.37
## Compactness 0.80 0.80 0.06 0.64 0.99 0.04 -0.22
## ShapeFactor1 0.01 0.01 0.00 0.00 0.01 -0.53 0.71
## ShapeFactor2 0.00 0.00 0.00 0.00 0.00 0.30 -0.86
## ShapeFactor3 0.64 0.64 0.10 0.41 0.97 0.24 -0.15
## ShapeFactor4 1.00 1.00 0.00 0.95 1.00 -2.76 13.03
Normalisasi data
df_scaled = scale(df)
head(df_scaled,5)
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## [1,] -0.8407176 -1.1432769 -1.306550 -0.6311299 -1.564995
## [2,] -0.8291572 -1.0138866 -1.395860 -0.4344286 -1.969712
## [3,] -0.8071275 -1.0787894 -1.252311 -0.5857131 -1.514236
## [4,] -0.7857117 -0.9771793 -1.278778 -0.4392741 -1.741554
## [5,] -0.7812103 -1.0973438 -1.380420 -0.2666536 -2.117915
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## [1,] -2.185640 -0.8414197 -1.0633015 0.2890768 0.3675999 1.4238148
## [2,] -3.685904 -0.8260712 -1.0441784 0.6974512 -0.4628896 0.2310455
## [3,] -2.045261 -0.8086740 -1.0080470 0.5781740 0.5183978 1.2528189
## [4,] -2.742110 -0.7739468 -0.9733011 0.6712350 -2.2416847 0.5150303
## [5,] -4.534862 -0.7842575 -0.9660443 0.4760028 0.8047429 1.8749235
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## [1,] 1.839049 0.6807614 2.402085 1.925653 0.8383402
## [2,] 2.495358 0.3679534 3.100779 2.689603 0.7711101
## [3,] 1.764778 0.6031067 2.235009 1.841288 0.9167215
## [4,] 2.081639 0.4017030 2.514982 2.204169 -0.1979782
## [5,] 2.765229 0.1182639 3.270862 3.013352 0.9396054
Uji Asumsi
set.seed(42)
hop <- get_clust_tendency(df_scaled, n = 100, graph = FALSE)
cat("Hopkins Statistic :", round(hop$hopkins_stat, 4), "\n")
## Hopkins Statistic : 0.9474
cat("Interpretasi: nilai mendekati 1 = kecenderungan clustering sangat kuat")
## Interpretasi: nilai mendekati 1 = kecenderungan clustering sangat kuat
Penentuan Jumlah Cluster
avg_sil <- function(k) {
km_res <- kmeans(df, centers = k, nstart = 25)
ss <- silhouette(km_res$cluster, dist(df))
mean(ss[, 3])
}
k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 680550)
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
xlab = "Jumlah Cluster K",
ylab = "Average Silhouette Width",
main = "Silhouette Analysis")

set.seed(42)
idx_sub <- sample(nrow(df_scaled), 3000)
X_sub <- df_scaled[idx_sub, ]
p_elbow <- fviz_nbclust(X_sub, kmeans, method = "wss", k.max = 10,
linecolor = "#2E4057") +
labs(title = "Elbow Method - K-Means",
x = "Jumlah Cluster (k)", y = "Total Within SS") +
theme_minimal(base_size = 11)
p_elbow

p_sil_opt <- fviz_nbclust(X_sub, kmeans, method = "silhouette", k.max = 10,
linecolor = "#E63946") +
labs(title = "Silhouette Method - K-Means",
x = "Jumlah Cluster (k)", y = "Avg Silhouette Width") +
theme_minimal(base_size = 11)
p_sil_opt

KMeans
set.seed(42)
kmeans_model <- kmeans(df_scaled, centers = 3, nstart = 25, iter.max = 100)
kmeans_model$centers
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1 4.0995827 3.4036023 3.1805982 3.8214703 0.008355137
## 2 0.3435942 0.5800852 0.6631724 0.2387072 0.840607254
## 3 -0.5330283 -0.6567305 -0.7016786 -0.4376004 -0.611293172
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## 1 0.2120395 4.0914883 3.6425032 0.5453134 -0.05539501 -0.1514141
## 2 0.7496530 0.3471842 0.4857851 -0.3156127 -0.50538526 -0.9235397
## 3 -0.5592939 -0.5350770 -0.6047348 0.1916009 0.37100047 0.6814346
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1 -0.1162200 -2.7650519 -1.4611164 -0.1442470 -0.7385019
## 2 -0.8314546 -0.3704941 -0.8854059 -0.8206257 -0.5670336
## 3 0.6121002 0.4603164 0.7442685 0.6061703 0.4630117
fviz_cluster(kmeans_model,
data = df_scaled,
geom = "point",
ellipse.type = "covex",
main = "Kmeans Clustering")

DBSCAN
kNNdistplot(df_scaled, k = 5)
abline(h = 1.5, col = "red", lty = 2)

dbscan_model <- dbscan(df_scaled, eps = 1.5, MinPts = 5)
dbscan_model
## dbscan Pts=13611 MinPts=5 eps=1.5
## 0 1 2 3
## border 160 103 34 4
## seed 0 12842 466 2
## total 160 12945 500 6
table(dbscan_model$cluster)
##
## 0 1 2 3
## 160 12945 500 6
fviz_cluster(list(data = df_scaled,
cluster = dbscan_model$cluster),
geom = "point", main = "DBSCAN Clustering")

Mean Shift
set.seed(42)
sample_idx <- sample(nrow(df_scaled),500)
df_sample <- df_scaled[sample_idx,]
ms_model <- meanShift(df_sample, bandwidth = rep(3, ncol(df_sample)))
table(ms_model$assignment)
##
## 1 2 3 4 5 6 7 8 9
## 229 134 120 7 2 4 1 1 2
pca <- prcomp(df_sample)
plot_df <-data.frame(pca$x[,1:2], cluster = as.factor(ms_model$assignment))
ggplot(plot_df, aes(PC1, PC2, color = cluster)) +
geom_point(alpha = 0.6) +
labs(title = "Mean Shift Clustering") +
theme_minimal()

Fuzzy C-Means
set.seed(42)
fcm_model <- cmeans(df_scaled, iter.max = 100,centers = 3, m=2,method = 'cmeans')
fcm_model$centers
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1 -0.5141789 -0.6686900 -0.75107620 -0.3577163 -0.8246493
## 2 -0.1570902 -0.1218142 -0.05827683 -0.2321242 0.2623228
## 3 0.8238365 0.9959984 1.03021763 0.7556609 0.6726727
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## 1 -0.8622697 -0.5165727 -0.5989192 0.29669609 0.44852716 0.8617650
## 2 0.3572704 -0.1574839 -0.1388292 -0.21134992 0.01798371 -0.1380381
## 3 0.6541931 0.8267620 0.9442543 -0.06437514 -0.49446907 -0.8153546
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1 0.8721061 0.3623836 0.9701448 0.8800172 0.536835915
## 2 -0.3074674 0.2211812 -0.2203736 -0.3217982 0.005687373
## 3 -0.6991012 -0.8403228 -0.9751984 -0.6981295 -0.667067584
fcm_model$size
## [1] 5191 4225 4195
hard_clusters <- apply(fcm_model$membership, 1, which.max)
table(hard_clusters)
## hard_clusters
## 1 2 3
## 5191 4225 4195
pca2 <- prcomp(df_scaled)
plot_df2 <- data.frame(pc1 = pca2$x[,1], pc2 = pca2$x[,2],
cluster = as.factor(hard_clusters))
ggplot(plot_df2, aes(pc1, pc2, color = cluster)) +
geom_point(alpha = 0.6) + labs(title = "Fuzzy C-Means Clustering") +theme_minimal()

Matrics
set.seed(42)
idx <- sample(nrow(df_scaled),1000)
sub <- df_scaled[idx,]
silhouette_kmeans <- silhouette(kmeans_model$cluster[idx], dist(sub))
silhouette_kmedian <- silhouette(kmedian_model$cluster[idx], dist(sub))
silhouette_dbscan <- silhouette(dbscan_model$cluster[idx], dist(sub))
silhouette_fuzzy <- silhouette(fcm_model$cluster[idx], dist(sub))
silhoutette_ms <- silhouette(as.integer(ms_model$assignment), dist(sample_idx))
cat("K-Means :", mean(silhouette_kmeans[,3]), "\n")
## K-Means : 0.4035393
cat("K-Median :", mean(silhouette_kmedian[,3]), "\n")
## K-Median : 0.2901268
cat("DBSCAN :", mean(silhouette_dbscan[,3]), "\n")
## DBSCAN : 0.4180372
cat("Mean Shift :", mean(silhoutette_ms[,3]), "\n")
## Mean Shift : -0.2156215
cat("Fuzzy CMeans:", mean(silhouette_fuzzy[,3]), "\n")
## Fuzzy CMeans: 0.2263632
dunn_km <- cluster.stats(dist(sub), kmeans_model$cluster[idx])$dunn
dunn_kmed <- cluster.stats(dist(sub), kmedian_model$cluster[idx])$dunn
dunn_DBSCAN <- cluster.stats(dist(sub), dbscan_model$cluster[idx])$dunn
## Warning in cluster.stats(dist(sub), dbscan_model$cluster[idx]): clustering
## renumbered because maximum != number of clusters
dunn_ms <- cluster.stats(dist(sample_idx), as.integer(ms_model$assignment))$dunn
dunn_fcm <- cluster.stats(dist(sub), fcm_model$cluster[idx])$dunn
cat("K-Means :", round(dunn_km, 4), "\n")
## K-Means : 0.0256
cat("K-Median :", round(dunn_kmed, 4), "\n")
## K-Median : 0.0226
cat("DBSCAN :", round(dunn_DBSCAN, 4), "\n")
## DBSCAN : 0.1252
cat("Mean Shift :", round(dunn_ms, 4), "\n")
## Mean Shift : 1e-04
cat("Fuzzy C-Means:", round(dunn_fcm, 4), "\n")
## Fuzzy C-Means: 0.0157
ari_km <- adjustedRandIndex(kmeans_model$cluster, true_labels)
ari_kmed <- adjustedRandIndex(kmedian_model$cluster, true_labels)
ari_db <- adjustedRandIndex(dbscan_model$cluster, true_labels)
ari_ms <- adjustedRandIndex(as.integer(ms_model$assignment), true_labels[sample_idx])
ari_fcm <- adjustedRandIndex(fcm_model$cluster, true_labels)
cat("K-Means :", round(ari_km, 4), "\n")
## K-Means : 0.2985
cat("K-Median :", round(ari_kmed, 4), "\n")
## K-Median : 0.3924
cat("DBSCAN :", round(ari_db, 4), "\n")
## DBSCAN : 0.0335
cat("Mean Shift :", round(ari_ms, 4), "\n")
## Mean Shift : 0.341
cat("Fuzzy C-Means:", round(ari_fcm, 4), "\n")
## Fuzzy C-Means: 0.317