Load Library

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:mclust':
## 
##     sim
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Pre Processing

df_raw <- read_excel("D:/.tugas semester 4 dika/analisis multivariat/Dry_Bean_Dataset.xlsx")
head(df_raw,5)
## # A tibble: 5 × 17
##    Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
##   <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
## 1 28395      610.            208.            174.         1.20        0.550
## 2 28734      638.            201.            183.         1.10        0.412
## 3 29380      624.            213.            176.         1.21        0.563
## 4 30008      646.            211.            183.         1.15        0.499
## 5 30140      620.            202.            190.         1.06        0.334
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## #   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## #   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>
# melihat misiing value
colSums(is.na(df_raw))
##            Area       Perimeter MajorAxisLength MinorAxisLength    AspectRation 
##               0               0               0               0               0 
##    Eccentricity      ConvexArea   EquivDiameter          Extent        Solidity 
##               0               0               0               0               0 
##       roundness     Compactness    ShapeFactor1    ShapeFactor2    ShapeFactor3 
##               0               0               0               0               0 
##    ShapeFactor4           Class 
##               0               0
true_labels <- as.numeric(as.factor(df_raw$Class))
# hapus class
df <- df_raw[,-17]

Statistika Deskriptif

df_describe <- describe(df)
df_describe <- select(df_describe,mean,median,sd,min,max,skew,kurtosis)
df_describe
##                     mean   median       sd      min       max  skew kurtosis
## Area            53048.28 44652.00 29324.10 20420.00 254616.00  2.95    10.79
## Perimeter         855.28   794.94   214.29   524.74   1985.37  1.63     3.59
## MajorAxisLength   320.14   296.88    85.69   183.60    738.86  1.36     2.53
## MinorAxisLength   202.27   192.43    44.97   122.51    460.20  2.24     6.65
## AspectRation        1.58     1.55     0.25     1.02      2.43  0.58     0.11
## Eccentricity        0.75     0.76     0.09     0.22      0.91 -1.06     1.39
## ConvexArea      53768.20 45178.00 29774.92 20684.00 263261.00  2.94    10.74
## EquivDiameter     253.06   238.44    59.18   161.24    569.37  1.95     5.19
## Extent              0.75     0.76     0.05     0.56      0.87 -0.90     0.64
## Solidity            0.99     0.99     0.00     0.92      0.99 -2.55    12.79
## roundness           0.87     0.88     0.06     0.49      0.99 -0.64     0.37
## Compactness         0.80     0.80     0.06     0.64      0.99  0.04    -0.22
## ShapeFactor1        0.01     0.01     0.00     0.00      0.01 -0.53     0.71
## ShapeFactor2        0.00     0.00     0.00     0.00      0.00  0.30    -0.86
## ShapeFactor3        0.64     0.64     0.10     0.41      0.97  0.24    -0.15
## ShapeFactor4        1.00     1.00     0.00     0.95      1.00 -2.76    13.03

Normalisasi data

df_scaled = scale(df)
head(df_scaled,5)
##            Area  Perimeter MajorAxisLength MinorAxisLength AspectRation
## [1,] -0.8407176 -1.1432769       -1.306550      -0.6311299    -1.564995
## [2,] -0.8291572 -1.0138866       -1.395860      -0.4344286    -1.969712
## [3,] -0.8071275 -1.0787894       -1.252311      -0.5857131    -1.514236
## [4,] -0.7857117 -0.9771793       -1.278778      -0.4392741    -1.741554
## [5,] -0.7812103 -1.0973438       -1.380420      -0.2666536    -2.117915
##      Eccentricity ConvexArea EquivDiameter    Extent   Solidity roundness
## [1,]    -2.185640 -0.8414197    -1.0633015 0.2890768  0.3675999 1.4238148
## [2,]    -3.685904 -0.8260712    -1.0441784 0.6974512 -0.4628896 0.2310455
## [3,]    -2.045261 -0.8086740    -1.0080470 0.5781740  0.5183978 1.2528189
## [4,]    -2.742110 -0.7739468    -0.9733011 0.6712350 -2.2416847 0.5150303
## [5,]    -4.534862 -0.7842575    -0.9660443 0.4760028  0.8047429 1.8749235
##      Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## [1,]    1.839049    0.6807614     2.402085     1.925653    0.8383402
## [2,]    2.495358    0.3679534     3.100779     2.689603    0.7711101
## [3,]    1.764778    0.6031067     2.235009     1.841288    0.9167215
## [4,]    2.081639    0.4017030     2.514982     2.204169   -0.1979782
## [5,]    2.765229    0.1182639     3.270862     3.013352    0.9396054

Uji Asumsi

set.seed(42)
hop <- get_clust_tendency(df_scaled, n = 100, graph = FALSE)
cat("Hopkins Statistic  :", round(hop$hopkins_stat, 4), "\n")
## Hopkins Statistic  : 0.9474
cat("Interpretasi: nilai mendekati 1 = kecenderungan clustering sangat kuat")
## Interpretasi: nilai mendekati 1 = kecenderungan clustering sangat kuat

Penentuan Jumlah Cluster

avg_sil <- function(k) {
  km_res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km_res$cluster, dist(df))
  mean(ss[, 3])
}
k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 680550)
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Jumlah Cluster K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")

set.seed(42)
idx_sub <- sample(nrow(df_scaled), 3000)
X_sub   <- df_scaled[idx_sub, ]

p_elbow <- fviz_nbclust(X_sub, kmeans, method = "wss", k.max = 10,
                        linecolor = "#2E4057") +
  labs(title = "Elbow Method - K-Means",
       x = "Jumlah Cluster (k)", y = "Total Within SS") +
  theme_minimal(base_size = 11)

p_elbow

p_sil_opt <- fviz_nbclust(X_sub, kmeans, method = "silhouette", k.max = 10,
                          linecolor = "#E63946") +
  labs(title = "Silhouette Method - K-Means",
       x = "Jumlah Cluster (k)", y = "Avg Silhouette Width") +
  theme_minimal(base_size = 11)

p_sil_opt

KMeans

set.seed(42)
kmeans_model <- kmeans(df_scaled, centers = 3, nstart = 25, iter.max = 100)
kmeans_model$centers
##         Area  Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1  4.0995827  3.4036023       3.1805982       3.8214703  0.008355137
## 2  0.3435942  0.5800852       0.6631724       0.2387072  0.840607254
## 3 -0.5330283 -0.6567305      -0.7016786      -0.4376004 -0.611293172
##   Eccentricity ConvexArea EquivDiameter     Extent    Solidity  roundness
## 1    0.2120395  4.0914883     3.6425032  0.5453134 -0.05539501 -0.1514141
## 2    0.7496530  0.3471842     0.4857851 -0.3156127 -0.50538526 -0.9235397
## 3   -0.5592939 -0.5350770    -0.6047348  0.1916009  0.37100047  0.6814346
##   Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1  -0.1162200   -2.7650519   -1.4611164   -0.1442470   -0.7385019
## 2  -0.8314546   -0.3704941   -0.8854059   -0.8206257   -0.5670336
## 3   0.6121002    0.4603164    0.7442685    0.6061703    0.4630117
fviz_cluster(kmeans_model,
             data = df_scaled,
             geom = "point", 
             ellipse.type = "covex",
             main = "Kmeans Clustering")

Kmedian

set.seed(42)
kmedian_model <- pam(df_scaled,k=3,metric = "manhattan")

kmedian_model$medoids
##            Area  Perimeter MajorAxisLength MinorAxisLength AspectRation
## [1,] -0.5090109 -0.6760636      -0.8546221      -0.1208822   -1.3331544
## [2,] -0.5119778 -0.5981877      -0.5598435      -0.5277487   -0.2384576
## [3,]  0.5933590  0.8046329       0.8959801       0.5589540    0.6574747
##      Eccentricity ConvexArea EquivDiameter      Extent  Solidity  roundness
## [1,]  -1.59982865 -0.5134255    -0.5534202 0.511437546 0.7569936  1.2758647
## [2,]   0.04213999 -0.5145338    -0.5576708 0.004941598 0.4539063  0.5174616
## [3,]   0.74685886  0.5896507     0.7846048 0.305764206 0.1204467 -0.5896924
##      Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## [1,]   1.4978589  -0.07702422    1.3706631    1.5415975    0.8423911
## [2,]   0.1409080   0.52489270    0.2863858    0.1028167    0.3559944
## [3,]  -0.7343709  -0.82389431   -0.9891199   -0.7500784   -0.3046418
fviz_cluster(kmedian_model,
             data= df_scaled,
             ellipse.type = "covex",
             geom = "point",
             main = "K-Median")

DBSCAN

kNNdistplot(df_scaled, k = 5)
abline(h = 1.5, col = "red", lty = 2)

dbscan_model <- dbscan(df_scaled, eps = 1.5, MinPts = 5)
dbscan_model
## dbscan Pts=13611 MinPts=5 eps=1.5
##          0     1   2 3
## border 160   103  34 4
## seed     0 12842 466 2
## total  160 12945 500 6
table(dbscan_model$cluster)
## 
##     0     1     2     3 
##   160 12945   500     6
fviz_cluster(list(data = df_scaled, 
                  cluster = dbscan_model$cluster), 
             
             
             geom = "point", main = "DBSCAN Clustering")

Mean Shift

set.seed(42)
sample_idx <- sample(nrow(df_scaled),500)
df_sample <- df_scaled[sample_idx,]
ms_model <- meanShift(df_sample, bandwidth = rep(3, ncol(df_sample)))
table(ms_model$assignment)
## 
##   1   2   3   4   5   6   7   8   9 
## 229 134 120   7   2   4   1   1   2
pca <- prcomp(df_sample)
plot_df <-data.frame(pca$x[,1:2], cluster = as.factor(ms_model$assignment))
ggplot(plot_df, aes(PC1, PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "Mean Shift Clustering") +
  theme_minimal()

Fuzzy C-Means

set.seed(42)
fcm_model <- cmeans(df_scaled, iter.max = 100,centers = 3, m=2,method = 'cmeans')
fcm_model$centers
##         Area  Perimeter MajorAxisLength MinorAxisLength AspectRation
## 1 -0.5141789 -0.6686900     -0.75107620      -0.3577163   -0.8246493
## 2 -0.1570902 -0.1218142     -0.05827683      -0.2321242    0.2623228
## 3  0.8238365  0.9959984      1.03021763       0.7556609    0.6726727
##   Eccentricity ConvexArea EquivDiameter      Extent    Solidity  roundness
## 1   -0.8622697 -0.5165727    -0.5989192  0.29669609  0.44852716  0.8617650
## 2    0.3572704 -0.1574839    -0.1388292 -0.21134992  0.01798371 -0.1380381
## 3    0.6541931  0.8267620     0.9442543 -0.06437514 -0.49446907 -0.8153546
##   Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## 1   0.8721061    0.3623836    0.9701448    0.8800172  0.536835915
## 2  -0.3074674    0.2211812   -0.2203736   -0.3217982  0.005687373
## 3  -0.6991012   -0.8403228   -0.9751984   -0.6981295 -0.667067584
fcm_model$size
## [1] 5191 4225 4195
hard_clusters <- apply(fcm_model$membership, 1, which.max)
table(hard_clusters)
## hard_clusters
##    1    2    3 
## 5191 4225 4195
pca2 <- prcomp(df_scaled)
plot_df2 <- data.frame(pc1 = pca2$x[,1], pc2 = pca2$x[,2], 
                       cluster = as.factor(hard_clusters))

ggplot(plot_df2, aes(pc1, pc2, color = cluster)) +
  geom_point(alpha = 0.6) + labs(title = "Fuzzy C-Means Clustering") +theme_minimal()

Matrics

set.seed(42)
idx <- sample(nrow(df_scaled),1000)
sub <- df_scaled[idx,]
silhouette_kmeans <- silhouette(kmeans_model$cluster[idx], dist(sub))
silhouette_kmedian <- silhouette(kmedian_model$cluster[idx], dist(sub))
silhouette_dbscan <- silhouette(dbscan_model$cluster[idx], dist(sub))
silhouette_fuzzy <- silhouette(fcm_model$cluster[idx], dist(sub))
silhoutette_ms <- silhouette(as.integer(ms_model$assignment), dist(sample_idx))
cat("K-Means    :", mean(silhouette_kmeans[,3]),   "\n")
## K-Means    : 0.4035393
cat("K-Median  :", mean(silhouette_kmedian[,3]), "\n")
## K-Median  : 0.2901268
cat("DBSCAN     :", mean(silhouette_dbscan[,3]),   "\n")
## DBSCAN     : 0.4180372
cat("Mean Shift :", mean(silhoutette_ms[,3]),    "\n")
## Mean Shift : -0.2156215
cat("Fuzzy CMeans:", mean(silhouette_fuzzy[,3]),     "\n")
## Fuzzy CMeans: 0.2263632
dunn_km   <- cluster.stats(dist(sub), kmeans_model$cluster[idx])$dunn
dunn_kmed <- cluster.stats(dist(sub), kmedian_model$cluster[idx])$dunn
dunn_DBSCAN <- cluster.stats(dist(sub), dbscan_model$cluster[idx])$dunn
## Warning in cluster.stats(dist(sub), dbscan_model$cluster[idx]): clustering
## renumbered because maximum != number of clusters
dunn_ms <- cluster.stats(dist(sample_idx), as.integer(ms_model$assignment))$dunn
dunn_fcm  <- cluster.stats(dist(sub), fcm_model$cluster[idx])$dunn


cat("K-Means      :", round(dunn_km,   4), "\n")
## K-Means      : 0.0256
cat("K-Median     :", round(dunn_kmed, 4), "\n")
## K-Median     : 0.0226
cat("DBSCAN       :", round(dunn_DBSCAN, 4), "\n")
## DBSCAN       : 0.1252
cat("Mean Shift    :", round(dunn_ms, 4), "\n")
## Mean Shift    : 1e-04
cat("Fuzzy C-Means:", round(dunn_fcm,  4), "\n")
## Fuzzy C-Means: 0.0157
ari_km   <- adjustedRandIndex(kmeans_model$cluster,   true_labels)
ari_kmed <- adjustedRandIndex(kmedian_model$cluster, true_labels)
ari_db   <- adjustedRandIndex(dbscan_model$cluster,   true_labels)
ari_ms   <- adjustedRandIndex(as.integer(ms_model$assignment), true_labels[sample_idx])
ari_fcm  <- adjustedRandIndex(fcm_model$cluster,        true_labels)

cat("K-Means      :", round(ari_km,   4), "\n")
## K-Means      : 0.2985
cat("K-Median     :", round(ari_kmed, 4), "\n")
## K-Median     : 0.3924
cat("DBSCAN       :", round(ari_db,   4), "\n")
## DBSCAN       : 0.0335
cat("Mean Shift    :", round(ari_ms,   4), "\n")
## Mean Shift    : 0.341
cat("Fuzzy C-Means:", round(ari_fcm,  4), "\n")
## Fuzzy C-Means: 0.317