Package

packages <- c(
  "readxl",       
  "dplyr",        
  "tidyr",        
  "ggplot2",      
  "factoextra", 
  "cluster",     
  "clusterSim",   
  "fclust",     
  "dbscan",       
  "meanShiftR",   
  "gridExtra",    
  "scales",       
  "ggpubr",
  "mclust",
  "moments",
  "flexclust",
  "e1071"
)

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}

lapply(packages, library, character.only = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Registered S3 method overwritten by 'fclust':
##   method       from 
##   print.fclust e1071
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
## 
##     bclust
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness
## The following object is masked from 'package:ggplot2':
## 
##     element
## [[1]]
## [1] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "dplyr"     "readxl"    "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "tidyr"     "dplyr"     "readxl"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "ggplot2"   "tidyr"     "dplyr"     "readxl"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
##  [6] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [11] "methods"    "base"      
## 
## [[6]]
##  [1] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
##  [6] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [11] "datasets"   "methods"    "base"      
## 
## [[7]]
##  [1] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
##  [6] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [11] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[8]]
##  [1] "fclust"     "clusterSim" "MASS"       "cluster"    "factoextra"
##  [6] "ggplot2"    "tidyr"      "dplyr"      "readxl"     "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[9]]
##  [1] "dbscan"     "fclust"     "clusterSim" "MASS"       "cluster"   
##  [6] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
## [11] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [16] "methods"    "base"      
## 
## [[10]]
##  [1] "meanShiftR" "dbscan"     "fclust"     "clusterSim" "MASS"      
##  [6] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
## [11] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"      
## 
## [[11]]
##  [1] "gridExtra"  "meanShiftR" "dbscan"     "fclust"     "clusterSim"
##  [6] "MASS"       "cluster"    "factoextra" "ggplot2"    "tidyr"     
## [11] "dplyr"      "readxl"     "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[12]]
##  [1] "scales"     "gridExtra"  "meanShiftR" "dbscan"     "fclust"    
##  [6] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
## [11] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[13]]
##  [1] "ggpubr"     "scales"     "gridExtra"  "meanShiftR" "dbscan"    
##  [6] "fclust"     "clusterSim" "MASS"       "cluster"    "factoextra"
## [11] "ggplot2"    "tidyr"      "dplyr"      "readxl"     "stats"     
## [16] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [21] "base"      
## 
## [[14]]
##  [1] "mclust"     "ggpubr"     "scales"     "gridExtra"  "meanShiftR"
##  [6] "dbscan"     "fclust"     "clusterSim" "MASS"       "cluster"   
## [11] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
## [16] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [21] "methods"    "base"      
## 
## [[15]]
##  [1] "moments"    "mclust"     "ggpubr"     "scales"     "gridExtra" 
##  [6] "meanShiftR" "dbscan"     "fclust"     "clusterSim" "MASS"      
## [11] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
## [16] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [21] "datasets"   "methods"    "base"      
## 
## [[16]]
##  [1] "flexclust"  "moments"    "mclust"     "ggpubr"     "scales"    
##  [6] "gridExtra"  "meanShiftR" "dbscan"     "fclust"     "clusterSim"
## [11] "MASS"       "cluster"    "factoextra" "ggplot2"    "tidyr"     
## [16] "dplyr"      "readxl"     "stats"      "graphics"   "grDevices" 
## [21] "utils"      "datasets"   "methods"    "base"      
## 
## [[17]]
##  [1] "e1071"      "flexclust"  "moments"    "mclust"     "ggpubr"    
##  [6] "scales"     "gridExtra"  "meanShiftR" "dbscan"     "fclust"    
## [11] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
## [16] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"

Load Data

data <- read.csv("Acoustic Features.csv", sep=",", header=TRUE)
X <- data[,c(2:51)]
# Convert class labels to numeric factors
data$Class_Numeric <- as.numeric(as.factor(data$Class))

# Show mapping
class_mapping <- data.frame(
  Original_Class = levels(as.factor(data$Class)),
  Numeric_Code = 1:length(levels(as.factor(data$Class)))
)

cat("Class Mapping (Numeric Encoding):\n")
## Class Mapping (Numeric Encoding):
print(class_mapping)
##   Original_Class Numeric_Code
## 1          angry            1
## 2          happy            2
## 3          relax            3
## 4            sad            4
# View first few rows
cat("\nFirst 10 rows:\n")
## 
## First 10 rows:
print(head(data[, c("Class", "Class_Numeric")], 10))
##    Class Class_Numeric
## 1  relax             3
## 2  relax             3
## 3  relax             3
## 4  relax             3
## 5  relax             3
## 6  relax             3
## 7  relax             3
## 8  relax             3
## 9  relax             3
## 10 relax             3

EDA

Sebaran data

tabel_deskriptif <- X %>%
  summarise(across(everything(), list(
    n        = ~sum(!is.na(.)),
    Mean     = ~mean(., na.rm = TRUE),
    Median   = ~median(., na.rm = TRUE),
    SD       = ~sd(., na.rm = TRUE),
    Min      = ~min(., na.rm = TRUE),
    Max      = ~max(., na.rm = TRUE),
    Skewness = ~moments::skewness(., na.rm = TRUE),
    Kurtosis = ~moments::kurtosis(., na.rm = TRUE)
  ))) %>%
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", ".value"),
    names_pattern = "(.*)_(.*)"
  )

print(tabel_deskriptif, n = 50)
## # A tibble: 50 × 9
##    Variable        n     Mean   Median      SD     Min     Max Skewness Kurtosis
##    <chr>       <int>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 X_RMSenerg…   400  1.35e-1  1.28e-1 6.44e-2   0.01  4.31e-1   0.703      3.60
##  2 X_Lowenerg…   400  5.54e-1  5.53e-1 5.07e-2   0.302 7.03e-1  -0.386      5.13
##  3 X_Fluctuat…   400  7.15e+0  6.73e+0 2.28e+0   3.58  2.35e+1   2.88      15.6 
##  4 X_Tempo_Me…   400  1.24e+2  1.20e+2 3.42e+1  48.3   1.95e+2   0.116      2.37
##  5 X_MFCC_Mea…   400  2.46e+0  2.39e+0 7.99e-1   0.323 6.00e+0   0.865      4.91
##  6 X_MFCC_Mea…   400  7.19e-2  6.85e-2 5.38e-1  -3.48  1.94e+0  -0.677      7.83
##  7 X_MFCC_Mea…   400  4.88e-1  4.65e-1 2.95e-1  -0.87  1.62e+0   0.111      4.16
##  8 X_MFCC_Mea…   400  3.05e-2  4.45e-2 2.76e-1  -1.64  1.13e+0  -0.695      7.01
##  9 X_MFCC_Mea…   400  1.79e-1  1.81e-1 1.95e-1  -0.494 1.06e+0   0.0748     4.16
## 10 X_MFCC_Mea…   400  3.83e-2  4.95e-2 2.04e-1  -0.916 7.99e-1  -0.283      5.53
## 11 X_MFCC_Mea…   400  5.99e-2  7.2 e-2 1.81e-1  -0.936 5.71e-1  -0.790      6.30
## 12 X_MFCC_Mea…   400  4.35e-2  3.95e-2 1.65e-1  -0.744 7.28e-1  -0.107      5.81
## 13 X_MFCC_Mea…   400  2.30e-2  1.65e-2 1.59e-1  -0.621 5.39e-1  -0.129      4.26
## 14 X_MFCC_Mea…   400  2.78e-2  3.15e-2 1.52e-1  -0.544 5.1 e-1  -0.246      4.05
## 15 X_MFCC_Mea…   400  2.88e-2  3.7 e-2 1.36e-1  -0.487 4.94e-1  -0.297      3.88
## 16 X_MFCC_Mea…   400  1.67e-2  2.25e-2 1.29e-1  -0.418 3.55e-1  -0.350      3.38
## 17 X_MFCC_Mea…   400  2.41e-2  3.9 e-2 1.33e-1  -0.62  5.36e-1  -0.493      4.98
## 18 X_Roughnes…   400  5.28e+2  3.68e+2 5.21e+2   0.941 3.90e+3   2.08       9.63
## 19 X_Roughnes…   400  7.20e-2  6.8 e-2 1.74e-1  -0.525 5.84e-1  -0.0234     3.40
## 20 X_Zero.cro…   400  9.97e+2  8.93e+2 5.25e+2 149.    3.15e+3   0.923      3.66
## 21 X_AttackTi…   400  3.13e-2  2.7 e-2 1.68e-2   0.01  1.65e-1   3.37      19.9 
## 22 X_AttackTi…   400 -2.89e-3  7.5 e-3 1.50e-1  -0.465 5.99e-1  -0.0114     3.45
## 23 X_Rolloff_…   400  5.69e+3  5.65e+3 2.29e+3 887.    1.15e+4   0.0959     2.38
## 24 X_Eventden…   400  2.78e+0  2.77e+0 1.33e+0   0.234 7.95e+0   0.474      3.12
## 25 X_Pulsecla…   400  2.49e-1  2.18e-1 1.55e-1   0.011 8.56e-1   1.16       4.18
## 26 X_Brightne…   400  4.34e-1  4.48e-1 1.32e-1   0.053 7.37e-1  -0.398      2.91
## 27 X_Spectral…   400  2.58e+3  2.55e+3 8.64e+2 607.    5.33e+3   0.224      2.85
## 28 X_Spectral…   400  3.08e+3  3.15e+3 7.68e+2 815.    4.72e+3  -0.259      2.56
## 29 X_Spectral…   400  1.87e+0  1.69e+0 8.82e-1   0.39  7.86e+0   2.27      12.0 
## 30 X_Spectral…   400  7.35e+0  5.22e+0 8.62e+0   1.93  1.22e+2   7.94      91.5 
## 31 X_Spectral…   400  4.85e-2  4.7 e-2 2.65e-2   0.006 2.09e-1   1.48       8.09
## 32 X_Entropyo…   400  8.73e-1  8.79e-1 3.73e-2   0.74  9.42e-1  -0.972      3.93
## 33 X_Chromagr…   400  3.53e-1  2.74e-1 3.23e-1   0     1   e+0   0.700      2.27
## 34 X_Chromagr…   400  2.53e-1  1.42e-1 2.88e-1   0     1   e+0   1.20       3.41
## 35 X_Chromagr…   400  3.65e-1  2.88e-1 3.25e-1   0     1   e+0   0.691      2.27
## 36 X_Chromagr…   400  2.08e-1  1.05e-1 2.54e-1   0     1   e+0   1.48       4.44
## 37 X_Chromagr…   400  3.50e-1  2.71e-1 3.04e-1   0     1   e+0   0.781      2.53
## 38 X_Chromagr…   400  2.64e-1  1.44e-1 2.93e-1   0     1   e+0   1.11       3.17
## 39 X_Chromagr…   400  2.43e-1  1.41e-1 2.76e-1   0     1   e+0   1.33       3.90
## 40 X_Chromagr…   400  3.92e-1  2.95e-1 3.31e-1   0     1   e+0   0.570      2.04
## 41 X_Chromagr…   400  3.55e-1  2.47e-1 3.35e-1   0     1   e+0   0.759      2.22
## 42 X_Chromagr…   400  5.91e-1  6.12e-1 3.58e-1   0     1   e+0  -0.218      1.56
## 43 X_Chromagr…   400  3.42e-1  2.47e-1 3.16e-1   0     1   e+0   0.711      2.30
## 44 X_Chromagr…   400  3.86e-1  2.96e-1 3.48e-1   0     1   e+0   0.546      1.88
## 45 X_Harmonic…   400  3.28e-1  3.33e-1 5.55e-2   0.112 4.88e-1  -0.464      3.47
## 46 X_Harmonic…   400  1.93e-1  1.9 e-1 4.71e-2   0.06  3.4 e-1   0.244      2.87
## 47 X_Harmonic…   400 -1.58e-4 -2   e-3 1.05e-1  -0.285 4.42e-1   0.200      3.90
## 48 X_Harmonic…   400  1.76e+0  1.68e+0 9.30e-1   0.187 4.49e+0   0.397      2.70
## 49 X_Harmonic…   400  7.70e-1  7.86e-1 7.21e-2   0.53  9.08e-1  -0.761      3.16
## 50 X_Harmonic…   400  9.67e-1  9.67e-1 3.84e-3   0.939 9.77e-1  -1.48      10.5

Standarisasi

X_scaled <- scale(X)

PCA

pca_result <- prcomp(X_scaled, center = FALSE, scale. = FALSE)

var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
n_pc_80 <- which(cumsum(var_explained) >= 0.80)
X_pca <- pca_result$x[, 1:n_pc_80]
## Warning in 1:n_pc_80: numerical expression has 29 elements: only the first used
cat("Original dimensions:", ncol(X_scaled), "\n")
## Original dimensions: 50
cat("PCA dimensions :", ncol(X_pca), "\n")
## PCA dimensions : 22

Hopkins Statistic (Original vs PCA)

set.seed(42)
hop_original <- get_clust_tendency(X_scaled, n = 100, graph = FALSE)
## Warning: Hopkins statistic uses the corrected formula (Wright 2022); results
## differ from legacy factoextra. Set options(factoextra.warn_hopkins = FALSE) to
## silence this warning.
cat("Hopkins Statistic (Original):", round(hop_original$hopkins_stat, 4), "\n")
## Hopkins Statistic (Original): 1
hop_pca <- get_clust_tendency(X_pca, n = 100, graph = FALSE)
cat("Hopkins Statistic (PCA)     :", round(hop_pca$hopkins_stat, 4), "\n")
## Hopkins Statistic (PCA)     : 1

Menentukan eps DBSCAN (PCA Data)

kNNdistplot(X_pca, k = 23)
abline(h = 8, col = "red", lty = 2)

Clustering Model

K-Means

Menentukan jumlah cluster optimal (Elbow & Silhouette)

set.seed(42)

# Elbow Method
fviz_nbclust(X_pca, kmeans, method = "wss") +
  labs(title = "Elbow Method untuk Optimal k")

# Silhouette Method
fviz_nbclust(X_pca, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method untuk Optimal k")

# Berdasarkan hasil eksplorasi, dipilih k optimal = 2
k_optimal <- 2

set.seed(42)
kmeans_res <- kmeans(X_pca, centers = k_optimal, nstart = 25)

kmeans_res
## K-means clustering with 2 clusters of sizes 231, 169
## 
## Cluster means:
##         PC1         PC2        PC3         PC4         PC5         PC6
## 1  1.936972 -0.05734675 -0.1078317 -0.04573734 -0.02410952 -0.07261830
## 2 -2.647577  0.07838521  0.1473912  0.06251672  0.03295443  0.09925933
##           PC7         PC8         PC9        PC10        PC11        PC12
## 1  0.05040918  0.03321966 -0.01913624 -0.03111916 -0.03432697 -0.02435921
## 2 -0.06890249 -0.04540676  0.02615663  0.04253565  0.04692029  0.03329573
##           PC13        PC14        PC15        PC16        PC17        PC18
## 1 -0.008972421  0.02160526  0.02161278 -0.03202798 -0.02217016  0.04579152
## 2  0.012264079 -0.02953144 -0.02954172  0.04377790  0.03030359 -0.06259078
##          PC19        PC20        PC21        PC22
## 1  0.01398905  0.08423306  0.09094821  0.05117816
## 2 -0.01912113 -0.11513513 -0.12431382 -0.06995358
## 
## Clustering vector:
##   [1] 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 1 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1
##  [38] 1 1 2 2 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 1 1 1 1 2
##  [75] 1 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1
## [223] 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1
## [260] 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 2 2 1 2 2 2 1 2 1 1
## [297] 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 2 1 1 2
## [334] 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 2 2 2
## [371] 1 1 2 2 2 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 1 2 1 1 1 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 7309.459 6766.697
##  (between_SS / total_SS =  12.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
table(Predicted = kmeans_res$cluster, Actual = data$Class)
##          Actual
## Predicted angry happy relax sad
##         1    63    98    35  35
##         2    37     2    65  65
ari_kmeans <- adjustedRandIndex(kmeans_res$cluster, data$Class_Numeric)
cat("K-Means ARI Score (PCA):", round(ari_kmeans, 4), "\n")
## K-Means ARI Score (PCA): 0.1294

Visualisasi hasil clustering pada PCA (PC1 vs PC2)

pca_kmeans_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(kmeans_res$cluster)
)

ggplot(pca_kmeans_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = paste("K-Means Clustering (k =", k_optimal, ") in PCA Space"),
       x = "PC1", y = "PC2") +
  theme_minimal()

K-Medians

df <- as.data.frame(X_pca)

set.seed(42)
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))

kmed_res
## kcca object of family 'kmedians' 
## 
## call:
## kcca(x = df, k = 3, family = kccaFamily("kmedians"))
## 
## cluster sizes:
## 
##   1   2   3 
## 135 104 161
kmed_cluster <- as.vector(clusters(kmed_res))

table(kmed_cluster)
## kmed_cluster
##   1   2   3 
## 135 104 161
table(Predicted = kmed_cluster, Actual = data$Class)
##          Actual
## Predicted angry happy relax sad
##         1    53     9    29  44
##         2     6     0    59  39
##         3    41    91    12  17
ari_kmed <- adjustedRandIndex(kmed_cluster, data$Class_Numeric)
cat("K-Medians ARI Score (PCA):", round(ari_kmed, 4), "\n")
## K-Medians ARI Score (PCA): 0.2135

Visualisasi clustering di PCA space

pca_kmed_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(kmed_cluster)
)

ggplot(pca_kmed_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "K-Medians Clustering (k = 3) in PCA Space",
       x = "PC1", y = "PC2") +
  theme_minimal()

Evaluasi Silhouette Score

sil_kmed <- silhouette(kmed_cluster, dist(df))
avg_sil_kmed <- mean(sil_kmed[, 3])
cat("Average Silhouette Score (K-Medians):", round(avg_sil_kmed, 4), "\n")
## Average Silhouette Score (K-Medians): 0.0825

DBSCAN on PCA Data

set.seed(42)
db_res <- dbscan(X_pca, eps = 12, minPts = 23)
db_res
## DBSCAN clustering for 400 objects.
## Parameters: eps = 12, minPts = 23
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 1 noise points.
## 
##   0   1 
##   1 399 
## 
## Available fields: cluster, eps, minPts, metric, borderPoints
db_res$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
table(Predicted = db_res$cluster, Actual = data$Class)
##          Actual
## Predicted angry happy relax sad
##         0     0     0     0   1
##         1   100   100   100  99
ari_dbscan <- adjustedRandIndex(db_res$cluster, data$Class_Numeric)
ari_dbscan
## [1] 0

DBSCAN gagal melakukan clustering pada 4 kelas. DBSCAN hanya menangkap 1 cluster. Ini terjadi karena nilai yang berdekatan yang dapat dilihat pada gambar dibawah ini,

pca_dbscan_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(db_res$cluster)
)

ggplot(pca_dbscan_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "DBSCAN Results in PCA Space (0 = Noise)",
       x = "PC1", y = "PC2") +
  theme_minimal()

Mean Shift

set.seed(42)
ms_res_pca <- meanShift(X_pca)
cat("Number of clusters found:", length(unique(ms_res_pca$assignment)), "\n")
## Number of clusters found: 381
# Visualize Mean Shift results in PCA space
pca_ms_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(ms_res_pca$assignment)
)

ggplot(pca_ms_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "Mean Shift Results in PCA Space",
       x = "PC1", y = "PC2") +
  theme_minimal() +
  theme(legend.position = "none")

ari_ms <- adjustedRandIndex(ms_res_pca$assignment, data$Class)
cat("Mean Shift ARI Score (PCA):", round(ari_ms, 4), "\n")
## Mean Shift ARI Score (PCA): 0.0014

Fuzzy C-means

# --- Fuzzy C-Means ---
df <- as.data.frame(X_pca)

set.seed(42)
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 adalah parameter fuzziness
fcm_cluster <- apply(fcm_res$membership, 1, which.max)

table(fcm_cluster)
## fcm_cluster
##   1   3 
## 186 214
table(Predicted = fcm_cluster, Actual = data$Class)
##          Actual
## Predicted angry happy relax sad
##         1    42     4    72  68
##         3    58    96    28  32
ari_fcm <- adjustedRandIndex(fcm_cluster, data$Class_Numeric)
cat("Fuzzy C-Means ARI Score (PCA):", round(ari_fcm, 4), "\n")
## Fuzzy C-Means ARI Score (PCA): 0.1434

Visualisasi clustering di PCA space

pca_fcm_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(fcm_cluster)
)

ggplot(pca_fcm_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "Fuzzy C-Means Clustering (k = 3) in PCA Space",
       x = "PC1", y = "PC2") +
  theme_minimal()

Visualisasi tingkat keanggotaan (cluster 1)

membership_df <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Membership = fcm_res$membership[, 1]
)

ggplot(membership_df, aes(x = PC1, y = PC2, color = Membership)) +
  geom_point(size = 2) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Membership Degree for Cluster 1",
       x = "PC1", y = "PC2") +
  theme_minimal()

Metrics Comparison

1. Silhouette Score

# Hitung distance matrix (pakai PCA data)
dist_matrix <- dist(X_pca)

# --- K-Means ---
sil_kmeans <- silhouette(kmeans_res$cluster, dist_matrix)
avg_sil_kmeans <- mean(sil_kmeans[, 3])

# --- K-Medians ---
sil_kmed <- silhouette(kmed_cluster, dist_matrix)
avg_sil_kmed <- mean(sil_kmed[, 3])

# --- DBSCAN ---
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0

if(length(unique(db_cluster[db_valid])) > 1){
  sil_db <- silhouette(db_cluster[db_valid], dist_matrix[db_valid, db_valid])
  avg_sil_db <- mean(sil_db[, 3])
} else {
  avg_sil_db <- NA
}

# --- Mean Shift ---
sil_ms <- silhouette(ms_res_pca$assignment, dist_matrix)
avg_sil_ms <- mean(sil_ms[, 3])

# --- Fuzzy C-Means ---
sil_fcm <- silhouette(fcm_cluster, dist_matrix)
avg_sil_fcm <- mean(sil_fcm[, 3])

# Gabungkan ke tabel
sil_summary <- data.frame(
  Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Silhouette_Score = c(
    avg_sil_kmeans,
    avg_sil_kmed,
    avg_sil_db,
    avg_sil_ms,
    avg_sil_fcm
  )
)

# Urutkan dari terbaik
sil_summary <- sil_summary[order(-sil_summary$Silhouette_Score), ]

print(sil_summary)
##          Method Silhouette_Score
## 1       K-Means        0.1221781
## 5 Fuzzy C-Means        0.1160965
## 4    Mean Shift        0.0904467
## 2     K-Medians        0.0825339
## 3        DBSCAN               NA
ggplot(sil_summary, aes(x = reorder(Method, Silhouette_Score), y = Silhouette_Score, fill = Method)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Perbandingan Silhouette Score Semua Metode",
       x = "Metode",
       y = "Average Silhouette Score") +
  theme_minimal() +
  geom_text(aes(label = round(Silhouette_Score, 3)), hjust = -0.1)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

2. Dunn Index (Higher is better)

df_matrix <- as.matrix(X_pca)

# --- K-Means ---
dunn_kmeans <- index.DB(df_matrix, kmeans_res$cluster)$DB

# --- K-Medians ---
dunn_kmed <- index.DB(df_matrix, kmed_cluster)$DB

# --- DBSCAN ---
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0

if(length(unique(db_cluster[db_valid])) > 1){
  dunn_db <- index.DB(df_matrix[db_valid, ], db_cluster[db_valid])$DB
} else {
  dunn_db <- NA
}

# --- Mean Shift ---
dunn_ms <- index.DB(df_matrix, ms_res_pca$assignment)$DB

# --- Fuzzy C-Means ---
dunn_fcm <- index.DB(df_matrix, fcm_cluster)$DB
## Warning in max(R[i, ][is.finite(R[i, ])]): no non-missing arguments to max;
## returning -Inf
# Gabungkan ke tabel
dunn_summary <- data.frame(
  Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Dunn_Index = c(
    dunn_kmeans,
    dunn_kmed,
    dunn_db,
    dunn_ms,
    dunn_fcm
  )
)

# Urutkan dari terbaik (higher is better)
dunn_summary <- dunn_summary[order(-dunn_summary$Dunn_Index), ]

print(dunn_summary)
##          Method Dunn_Index
## 2     K-Medians 3.44366080
## 5 Fuzzy C-Means 2.61553444
## 1       K-Means 2.58918077
## 4    Mean Shift 0.06002086
## 3        DBSCAN         NA
ggplot(dunn_summary, aes(x = reorder(Method, Dunn_Index), y = Dunn_Index, fill = Method)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Perbandingan Dunn Index Semua Metode",
       x = "Metode",
       y = "Dunn Index") +
  theme_minimal() +
  geom_text(aes(label = round(Dunn_Index, 3)), hjust = -0.1)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

3. ARI Score Summary

ari_summary <- data.frame(
  Method = c(
    "K-Means",
    "K-Medians",
    "DBSCAN", 
    "Mean Shift",
    "Fuzzy C-Means"
  ),
  ARI_Score = c(
    ari_kmeans,
    ari_kmed,
    ari_dbscan,
    ari_ms,
    ari_fcm
  )
)

# Urutkan dari terbaik
ari_summary <- ari_summary[order(-ari_summary$ARI_Score), ]

print(ari_summary)
##          Method   ARI_Score
## 2     K-Medians 0.213518389
## 5 Fuzzy C-Means 0.143394141
## 1       K-Means 0.129388875
## 4    Mean Shift 0.001442304
## 3        DBSCAN 0.000000000
# Visualize ARI scores
ggplot(ari_summary, aes(x = reorder(Method, ARI_Score), y = ARI_Score, fill = Method)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "ARI Score Comparison",
       x = "Clustering Method",
       y = "Adjusted Rand Index") +
  theme_minimal() +
  geom_text(aes(label = round(ARI_Score, 3)), hjust = -0.1)