Package

# Packages attached by this analysis, in load order (a package attached later
# masks same-named functions from the ones attached before it).
packages <- c(
  "readxl",
  "dplyr",
  "tidyr",
  "ggplot2",
  "factoextra",
  "cluster",
  "clusterSim",
  "fclust",
  "dbscan",
  "meanShiftR",
  "gridExtra",
  "scales",
  "ggpubr",
  "mclust",
  "moments",
  "flexclust",
  "e1071"
)

# Detect missing packages. system.file() returns "" for a package that is not
# installed; this avoids scanning the whole library with installed.packages(),
# which its documentation advises against for presence checks.
installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (any(!installed)) {
  install.packages(packages[!installed])
}

# Attach every package. invisible() keeps the list returned by lapply()
# (one search-path vector per package) out of the rendered report.
invisible(lapply(packages, library, character.only = TRUE))

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Welcome to factoextra!

## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Registered S3 method overwritten by 'fclust':
##   method       from 
##   print.fclust e1071

## 
## Attaching package: 'dbscan'

## The following object is masked from 'package:stats':
## 
##     as.dendrogram

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.

## 
## Attaching package: 'mclust'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:flexclust':
## 
##     bclust

## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness

## The following object is masked from 'package:ggplot2':
## 
##     element

## [[1]]
## [1] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "dplyr"     "readxl"    "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "tidyr"     "dplyr"     "readxl"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "ggplot2"   "tidyr"     "dplyr"     "readxl"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
##  [6] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [11] "methods"    "base"      
## 
## [[6]]
##  [1] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
##  [6] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [11] "datasets"   "methods"    "base"      
## 
## [[7]]
##  [1] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
##  [6] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [11] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[8]]
##  [1] "fclust"     "clusterSim" "MASS"       "cluster"    "factoextra"
##  [6] "ggplot2"    "tidyr"      "dplyr"      "readxl"     "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[9]]
##  [1] "dbscan"     "fclust"     "clusterSim" "MASS"       "cluster"   
##  [6] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
## [11] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [16] "methods"    "base"      
## 
## [[10]]
##  [1] "meanShiftR" "dbscan"     "fclust"     "clusterSim" "MASS"      
##  [6] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
## [11] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"      
## 
## [[11]]
##  [1] "gridExtra"  "meanShiftR" "dbscan"     "fclust"     "clusterSim"
##  [6] "MASS"       "cluster"    "factoextra" "ggplot2"    "tidyr"     
## [11] "dplyr"      "readxl"     "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[12]]
##  [1] "scales"     "gridExtra"  "meanShiftR" "dbscan"     "fclust"    
##  [6] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
## [11] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[13]]
##  [1] "ggpubr"     "scales"     "gridExtra"  "meanShiftR" "dbscan"    
##  [6] "fclust"     "clusterSim" "MASS"       "cluster"    "factoextra"
## [11] "ggplot2"    "tidyr"      "dplyr"      "readxl"     "stats"     
## [16] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [21] "base"      
## 
## [[14]]
##  [1] "mclust"     "ggpubr"     "scales"     "gridExtra"  "meanShiftR"
##  [6] "dbscan"     "fclust"     "clusterSim" "MASS"       "cluster"   
## [11] "factoextra" "ggplot2"    "tidyr"      "dplyr"      "readxl"    
## [16] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [21] "methods"    "base"      
## 
## [[15]]
##  [1] "moments"    "mclust"     "ggpubr"     "scales"     "gridExtra" 
##  [6] "meanShiftR" "dbscan"     "fclust"     "clusterSim" "MASS"      
## [11] "cluster"    "factoextra" "ggplot2"    "tidyr"      "dplyr"     
## [16] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [21] "datasets"   "methods"    "base"      
## 
## [[16]]
##  [1] "flexclust"  "moments"    "mclust"     "ggpubr"     "scales"    
##  [6] "gridExtra"  "meanShiftR" "dbscan"     "fclust"     "clusterSim"
## [11] "MASS"       "cluster"    "factoextra" "ggplot2"    "tidyr"     
## [16] "dplyr"      "readxl"     "stats"      "graphics"   "grDevices" 
## [21] "utils"      "datasets"   "methods"    "base"      
## 
## [[17]]
##  [1] "e1071"      "flexclust"  "moments"    "mclust"     "ggpubr"    
##  [6] "scales"     "gridExtra"  "meanShiftR" "dbscan"     "fclust"    
## [11] "clusterSim" "MASS"       "cluster"    "factoextra" "ggplot2"   
## [16] "tidyr"      "dplyr"      "readxl"     "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"

Load Data

# Read the acoustic-features table; columns 2 to 51 are used as the feature set.
data <- read.csv("Acoustic Features.csv", sep = ",", header = TRUE)
X <- data[, 2:51]

# Convert class labels to numeric codes. The factor is built once so the
# codes and the mapping table below are guaranteed to share the same levels.
class_factor <- as.factor(data$Class)
data$Class_Numeric <- as.numeric(class_factor)

# Show mapping (seq_along() stays correct even when there are no levels,
# where 1:length(...) would yield c(1, 0))
class_mapping <- data.frame(
  Original_Class = levels(class_factor),
  Numeric_Code = seq_along(levels(class_factor))
)

cat("Class Mapping (Numeric Encoding):\n")

## Class Mapping (Numeric Encoding):

print(class_mapping)

##   Original_Class Numeric_Code
## 1          angry            1
## 2          happy            2
## 3          relax            3
## 4            sad            4

# View first few rows
cat("\nFirst 10 rows:\n")

## 
## First 10 rows:

print(head(data[, c("Class", "Class_Numeric")], 10))

##    Class Class_Numeric
## 1  relax             3
## 2  relax             3
## 3  relax             3
## 4  relax             3
## 5  relax             3
## 6  relax             3
## 7  relax             3
## 8  relax             3
## 9  relax             3
## 10 relax             3

EDA

Sebaran data

# Descriptive statistics computed for every feature column; the list names
# become the suffix of each summary column ("<feature>_<statistic>").
stat_funs <- list(
  n        = function(v) sum(!is.na(v)),
  Mean     = function(v) mean(v, na.rm = TRUE),
  Median   = function(v) median(v, na.rm = TRUE),
  SD       = function(v) sd(v, na.rm = TRUE),
  Min      = function(v) min(v, na.rm = TRUE),
  Max      = function(v) max(v, na.rm = TRUE),
  Skewness = function(v) moments::skewness(v, na.rm = TRUE),
  Kurtosis = function(v) moments::kurtosis(v, na.rm = TRUE)
)

# One wide row holding every feature/statistic combination ...
wide_stats <- summarise(X, across(everything(), stat_funs))

# ... reshaped to one row per feature and one column per statistic. The greedy
# first group of the pattern splits each name at its last underscore.
tabel_deskriptif <- pivot_longer(
  wide_stats,
  cols = everything(),
  names_to = c("Variable", ".value"),
  names_pattern = "(.*)_(.*)"
)

print(tabel_deskriptif, n = 50)

## # A tibble: 50 × 9
##    Variable        n     Mean   Median      SD     Min     Max Skewness Kurtosis
##    <chr>       <int>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 X_RMSenerg…   400  1.35e-1  1.28e-1 6.44e-2   0.01  4.31e-1   0.703      3.60
##  2 X_Lowenerg…   400  5.54e-1  5.53e-1 5.07e-2   0.302 7.03e-1  -0.386      5.13
##  3 X_Fluctuat…   400  7.15e+0  6.73e+0 2.28e+0   3.58  2.35e+1   2.88      15.6 
##  4 X_Tempo_Me…   400  1.24e+2  1.20e+2 3.42e+1  48.3   1.95e+2   0.116      2.37
##  5 X_MFCC_Mea…   400  2.46e+0  2.39e+0 7.99e-1   0.323 6.00e+0   0.865      4.91
##  6 X_MFCC_Mea…   400  7.19e-2  6.85e-2 5.38e-1  -3.48  1.94e+0  -0.677      7.83
##  7 X_MFCC_Mea…   400  4.88e-1  4.65e-1 2.95e-1  -0.87  1.62e+0   0.111      4.16
##  8 X_MFCC_Mea…   400  3.05e-2  4.45e-2 2.76e-1  -1.64  1.13e+0  -0.695      7.01
##  9 X_MFCC_Mea…   400  1.79e-1  1.81e-1 1.95e-1  -0.494 1.06e+0   0.0748     4.16
## 10 X_MFCC_Mea…   400  3.83e-2  4.95e-2 2.04e-1  -0.916 7.99e-1  -0.283      5.53
## 11 X_MFCC_Mea…   400  5.99e-2  7.2 e-2 1.81e-1  -0.936 5.71e-1  -0.790      6.30
## 12 X_MFCC_Mea…   400  4.35e-2  3.95e-2 1.65e-1  -0.744 7.28e-1  -0.107      5.81
## 13 X_MFCC_Mea…   400  2.30e-2  1.65e-2 1.59e-1  -0.621 5.39e-1  -0.129      4.26
## 14 X_MFCC_Mea…   400  2.78e-2  3.15e-2 1.52e-1  -0.544 5.1 e-1  -0.246      4.05
## 15 X_MFCC_Mea…   400  2.88e-2  3.7 e-2 1.36e-1  -0.487 4.94e-1  -0.297      3.88
## 16 X_MFCC_Mea…   400  1.67e-2  2.25e-2 1.29e-1  -0.418 3.55e-1  -0.350      3.38
## 17 X_MFCC_Mea…   400  2.41e-2  3.9 e-2 1.33e-1  -0.62  5.36e-1  -0.493      4.98
## 18 X_Roughnes…   400  5.28e+2  3.68e+2 5.21e+2   0.941 3.90e+3   2.08       9.63
## 19 X_Roughnes…   400  7.20e-2  6.8 e-2 1.74e-1  -0.525 5.84e-1  -0.0234     3.40
## 20 X_Zero.cro…   400  9.97e+2  8.93e+2 5.25e+2 149.    3.15e+3   0.923      3.66
## 21 X_AttackTi…   400  3.13e-2  2.7 e-2 1.68e-2   0.01  1.65e-1   3.37      19.9 
## 22 X_AttackTi…   400 -2.89e-3  7.5 e-3 1.50e-1  -0.465 5.99e-1  -0.0114     3.45
## 23 X_Rolloff_…   400  5.69e+3  5.65e+3 2.29e+3 887.    1.15e+4   0.0959     2.38
## 24 X_Eventden…   400  2.78e+0  2.77e+0 1.33e+0   0.234 7.95e+0   0.474      3.12
## 25 X_Pulsecla…   400  2.49e-1  2.18e-1 1.55e-1   0.011 8.56e-1   1.16       4.18
## 26 X_Brightne…   400  4.34e-1  4.48e-1 1.32e-1   0.053 7.37e-1  -0.398      2.91
## 27 X_Spectral…   400  2.58e+3  2.55e+3 8.64e+2 607.    5.33e+3   0.224      2.85
## 28 X_Spectral…   400  3.08e+3  3.15e+3 7.68e+2 815.    4.72e+3  -0.259      2.56
## 29 X_Spectral…   400  1.87e+0  1.69e+0 8.82e-1   0.39  7.86e+0   2.27      12.0 
## 30 X_Spectral…   400  7.35e+0  5.22e+0 8.62e+0   1.93  1.22e+2   7.94      91.5 
## 31 X_Spectral…   400  4.85e-2  4.7 e-2 2.65e-2   0.006 2.09e-1   1.48       8.09
## 32 X_Entropyo…   400  8.73e-1  8.79e-1 3.73e-2   0.74  9.42e-1  -0.972      3.93
## 33 X_Chromagr…   400  3.53e-1  2.74e-1 3.23e-1   0     1   e+0   0.700      2.27
## 34 X_Chromagr…   400  2.53e-1  1.42e-1 2.88e-1   0     1   e+0   1.20       3.41
## 35 X_Chromagr…   400  3.65e-1  2.88e-1 3.25e-1   0     1   e+0   0.691      2.27
## 36 X_Chromagr…   400  2.08e-1  1.05e-1 2.54e-1   0     1   e+0   1.48       4.44
## 37 X_Chromagr…   400  3.50e-1  2.71e-1 3.04e-1   0     1   e+0   0.781      2.53
## 38 X_Chromagr…   400  2.64e-1  1.44e-1 2.93e-1   0     1   e+0   1.11       3.17
## 39 X_Chromagr…   400  2.43e-1  1.41e-1 2.76e-1   0     1   e+0   1.33       3.90
## 40 X_Chromagr…   400  3.92e-1  2.95e-1 3.31e-1   0     1   e+0   0.570      2.04
## 41 X_Chromagr…   400  3.55e-1  2.47e-1 3.35e-1   0     1   e+0   0.759      2.22
## 42 X_Chromagr…   400  5.91e-1  6.12e-1 3.58e-1   0     1   e+0  -0.218      1.56
## 43 X_Chromagr…   400  3.42e-1  2.47e-1 3.16e-1   0     1   e+0   0.711      2.30
## 44 X_Chromagr…   400  3.86e-1  2.96e-1 3.48e-1   0     1   e+0   0.546      1.88
## 45 X_Harmonic…   400  3.28e-1  3.33e-1 5.55e-2   0.112 4.88e-1  -0.464      3.47
## 46 X_Harmonic…   400  1.93e-1  1.9 e-1 4.71e-2   0.06  3.4 e-1   0.244      2.87
## 47 X_Harmonic…   400 -1.58e-4 -2   e-3 1.05e-1  -0.285 4.42e-1   0.200      3.90
## 48 X_Harmonic…   400  1.76e+0  1.68e+0 9.30e-1   0.187 4.49e+0   0.397      2.70
## 49 X_Harmonic…   400  7.70e-1  7.86e-1 7.21e-2   0.53  9.08e-1  -0.761      3.16
## 50 X_Harmonic…   400  9.67e-1  9.67e-1 3.84e-3   0.939 9.77e-1  -1.48      10.5

Standarisasi

X_scaled <- scale(X)

PCA

# PCA on the already standardised features, so prcomp() must not centre or
# scale them again.
pca_result <- prcomp(X_scaled, center = FALSE, scale. = FALSE)

# Share of total variance carried by each principal component
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)

# Smallest number of components whose cumulative variance reaches 80 %.
# which() returns every index past the threshold, so take the first one
# explicitly instead of relying on `1:n_pc_80` truncating a vector (which
# raised "numerical expression has 29 elements: only the first used").
n_pc_80 <- which(cumsum(var_explained) >= 0.80)[1]

# Keep the leading components; drop = FALSE keeps a matrix even if only one
# component is retained.
X_pca <- pca_result$x[, seq_len(n_pc_80), drop = FALSE]

## Warning in 1:n_pc_80: numerical expression has 29 elements: only the first used

cat("Original dimensions:", ncol(X_scaled), "\n")

## Original dimensions: 50

cat("PCA dimensions :", ncol(X_pca), "\n")

## PCA dimensions : 22

Hopkins Statistic (Original vs PCA)

# Hopkins statistic on the standardised features, estimated from n = 100
# sampled points; the seed is fixed before the call and the diagnostic plot
# is disabled.
set.seed(42)
hop_original <- get_clust_tendency(X_scaled, n = 100, graph = FALSE)

## Warning: Hopkins statistic uses the corrected formula (Wright 2022); results
## differ from legacy factoextra. Set options(factoextra.warn_hopkins = FALSE) to
## silence this warning.

cat("Hopkins Statistic (Original):", round(hop_original$hopkins_stat, 4), "\n")

## Hopkins Statistic (Original): 1

# Same clustering-tendency check on the retained PCA scores. No new seed is
# set here, so this call continues from the RNG state left by the previous one.
hop_pca <- get_clust_tendency(X_pca, n = 100, graph = FALSE)
cat("Hopkins Statistic (PCA)     :", round(hop_pca$hopkins_stat, 4), "\n")

## Hopkins Statistic (PCA)     : 1

Menentukan eps DBSCAN (PCA Data)

# k-NN distance plot used to choose eps for DBSCAN. The dbscan documentation
# recommends k = minPts - 1 (the point itself counts towards minPts); DBSCAN
# is run with minPts = 23 in this report, hence k = 22.
kNNdistplot(X_pca, k = 22)
# Candidate eps marked on the plot.
# NOTE(review): the DBSCAN call in this report uses eps = 12, not 8 - confirm
# which value is intended and keep the two in sync.
abline(h = 8, col = "red", lty = 2)

Clustering Model

K-Means

Menentukan jumlah cluster optimal (Elbow & Silhouette)

# Fix the seed before the k-selection diagnostics, which run kmeans repeatedly
set.seed(42)

# Elbow Method: total within-cluster sum of squares for each candidate k
fviz_nbclust(X_pca, kmeans, method = "wss") +
  labs(title = "Elbow Method untuk Optimal k")

# Silhouette Method: average silhouette width for each candidate k
fviz_nbclust(X_pca, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method untuk Optimal k")

# Based on the exploration above, the optimal k is chosen as 2
k_optimal <- 2

# Re-seed so the final fit does not depend on how much randomness the
# diagnostics consumed; nstart = 25 requests 25 random starts
set.seed(42)
kmeans_res <- kmeans(X_pca, centers = k_optimal, nstart = 25)

kmeans_res

## K-means clustering with 2 clusters of sizes 231, 169
## 
## Cluster means:
##         PC1         PC2        PC3         PC4         PC5         PC6
## 1  1.936972 -0.05734675 -0.1078317 -0.04573734 -0.02410952 -0.07261830
## 2 -2.647577  0.07838521  0.1473912  0.06251672  0.03295443  0.09925933
##           PC7         PC8         PC9        PC10        PC11        PC12
## 1  0.05040918  0.03321966 -0.01913624 -0.03111916 -0.03432697 -0.02435921
## 2 -0.06890249 -0.04540676  0.02615663  0.04253565  0.04692029  0.03329573
##           PC13        PC14        PC15        PC16        PC17        PC18
## 1 -0.008972421  0.02160526  0.02161278 -0.03202798 -0.02217016  0.04579152
## 2  0.012264079 -0.02953144 -0.02954172  0.04377790  0.03030359 -0.06259078
##          PC19        PC20        PC21        PC22
## 1  0.01398905  0.08423306  0.09094821  0.05117816
## 2 -0.01912113 -0.11513513 -0.12431382 -0.06995358
## 
## Clustering vector:
##   [1] 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 1 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1
##  [38] 1 1 2 2 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 1 1 1 1 2
##  [75] 1 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1
## [223] 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1
## [260] 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 2 2 1 2 2 2 1 2 1 1
## [297] 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 2 1 1 2
## [334] 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 2 2 2
## [371] 1 1 2 2 2 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 1 2 1 1 1 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 7309.459 6766.697
##  (between_SS / total_SS =  12.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

table(Predicted = kmeans_res$cluster, Actual = data$Class)

##          Actual
## Predicted angry happy relax sad
##         1    63    98    35  35
##         2    37     2    65  65

ari_kmeans <- adjustedRandIndex(kmeans_res$cluster, data$Class_Numeric)
cat("K-Means ARI Score (PCA):", round(ari_kmeans, 4), "\n")

## K-Means ARI Score (PCA): 0.1294

Visualisasi hasil clustering pada PCA (PC1 vs PC2)

# Scores on the first two principal components, paired with the K-Means label
# of each observation.
pca_kmeans_viz <- data.frame(
  PC1 = pca_result$x[, "PC1"],
  PC2 = pca_result$x[, "PC2"],
  Cluster = as.factor(kmeans_res$cluster)
)

# Scatter plot of the K-Means partition in the PC1/PC2 plane
ggplot(data = pca_kmeans_viz, mapping = aes(PC1, PC2, colour = Cluster)) +
  geom_point(size = 2, alpha = 0.6) +
  ggtitle(paste("K-Means Clustering (k =", k_optimal, ") in PCA Space")) +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

K-Medians

# K-Medians on the PCA scores, fitted through kcca() with the "kmedians" family
df <- as.data.frame(X_pca)

set.seed(42)
# The number of clusters is hard-coded to 3 here (K-Means uses k_optimal = 2)
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))

kmed_res

## kcca object of family 'kmedians' 
## 
## call:
## kcca(x = df, k = 3, family = kccaFamily("kmedians"))
## 
## cluster sizes:
## 
##   1   2   3 
## 135 104 161

kmed_cluster <- as.vector(clusters(kmed_res))

table(kmed_cluster)

## kmed_cluster
##   1   2   3 
## 135 104 161

table(Predicted = kmed_cluster, Actual = data$Class)

##          Actual
## Predicted angry happy relax sad
##         1    53     9    29  44
##         2     6     0    59  39
##         3    41    91    12  17

ari_kmed <- adjustedRandIndex(kmed_cluster, data$Class_Numeric)
cat("K-Medians ARI Score (PCA):", round(ari_kmed, 4), "\n")

## K-Medians ARI Score (PCA): 0.2135

Visualisasi clustering di PCA space

# Scores on the first two principal components, paired with the K-Medians
# label of each observation.
pca_kmed_viz <- data.frame(
  PC1 = pca_result$x[, "PC1"],
  PC2 = pca_result$x[, "PC2"],
  Cluster = as.factor(kmed_cluster)
)

# Scatter plot of the K-Medians partition in the PC1/PC2 plane
ggplot(data = pca_kmed_viz, mapping = aes(PC1, PC2, colour = Cluster)) +
  geom_point(size = 2, alpha = 0.6) +
  ggtitle("K-Medians Clustering (k = 3) in PCA Space") +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

Evaluasi Silhouette Score

# Silhouette widths of the K-Medians partition, using Euclidean distances
# between the PCA scores; the reported figure is their mean.
sil_kmed <- silhouette(kmed_cluster, dist(df))
avg_sil_kmed <- mean(sil_kmed[, "sil_width"])
cat("Average Silhouette Score (K-Medians):", round(avg_sil_kmed, digits = 4), "\n")

## Average Silhouette Score (K-Medians): 0.0825

DBSCAN on PCA Data

# DBSCAN on the PCA scores.
# NOTE(review): eps = 12 differs from the reference line drawn at h = 8 on the
# k-NN distance plot earlier in this report - confirm which value is intended.
set.seed(42)
db_res <- dbscan(X_pca, eps = 12, minPts = 23)
db_res

## DBSCAN clustering for 400 objects.
## Parameters: eps = 12, minPts = 23
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 1 noise points.
## 
##   0   1 
##   1 399 
## 
## Available fields: cluster, eps, minPts, metric, borderPoints

db_res$cluster

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

table(Predicted = db_res$cluster, Actual = data$Class)

##          Actual
## Predicted angry happy relax sad
##         0     0     0     0   1
##         1   100   100   100  99

ari_dbscan <- adjustedRandIndex(db_res$cluster, data$Class_Numeric)
ari_dbscan

## [1] 0

DBSCAN gagal memisahkan data menjadi 4 kelas dan hanya menghasilkan 1 cluster. Hal ini terjadi karena titik-titik data saling berdekatan, seperti yang dapat dilihat pada gambar di bawah ini.

# Scores on the first two principal components, paired with the DBSCAN label
# of each observation (label 0 marks noise points).
pca_dbscan_viz <- data.frame(
  PC1 = pca_result$x[, "PC1"],
  PC2 = pca_result$x[, "PC2"],
  Cluster = as.factor(db_res$cluster)
)

# Scatter plot of the DBSCAN result in the PC1/PC2 plane
ggplot(data = pca_dbscan_viz, mapping = aes(PC1, PC2, colour = Cluster)) +
  geom_point(size = 2, alpha = 0.6) +
  ggtitle("DBSCAN Results in PCA Space (0 = Noise)") +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

Mean Shift

# Mean Shift on the PCA scores; no bandwidth or other tuning argument is
# passed, so the function's defaults apply.
# NOTE(review): the rendered run reports 381 clusters for 400 observations,
# i.e. almost every point ends up alone - presumably the default bandwidth is
# too small for this data; TODO confirm and tune.
set.seed(42)
ms_res_pca <- meanShift(X_pca)
cat("Number of clusters found:", length(unique(ms_res_pca$assignment)), "\n")

## Number of clusters found: 381

# Visualize Mean Shift results in PCA space (first two components)
pca_ms_viz <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Cluster = as.factor(ms_res_pca$assignment)
)

# The legend is hidden because the number of distinct cluster labels is too
# large to display.
ggplot(pca_ms_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(title = "Mean Shift Results in PCA Space",
       x = "PC1", y = "PC2") +
  theme_minimal() +
  theme(legend.position = "none")

# Agreement with the true classes. Uses Class_Numeric like every other method
# in this report; ARI only depends on the grouping, so the score is the same
# as with the character labels.
ari_ms <- adjustedRandIndex(ms_res_pca$assignment, data$Class_Numeric)
cat("Mean Shift ARI Score (PCA):", round(ari_ms, 4), "\n")

## Mean Shift ARI Score (PCA): 0.0014

Fuzzy C-means

# --- Fuzzy C-Means ---
# Fit on the PCA scores as a data frame
df <- as.data.frame(X_pca)

set.seed(42)
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 is the fuzziness parameter

# Hard assignment: for each observation (row of the membership matrix), take
# the index of the cluster with the highest membership degree
fcm_cluster <- apply(fcm_res$membership, 1, which.max)

# Cluster sizes after the hard assignment
table(fcm_cluster)

## fcm_cluster
##   1   3 
## 186 214

table(Predicted = fcm_cluster, Actual = data$Class)

##          Actual
## Predicted angry happy relax sad
##         1    42     4    72  68
##         3    58    96    28  32

ari_fcm <- adjustedRandIndex(fcm_cluster, data$Class_Numeric)
cat("Fuzzy C-Means ARI Score (PCA):", round(ari_fcm, 4), "\n")

## Fuzzy C-Means ARI Score (PCA): 0.1434

Visualisasi clustering di PCA space

# Scores on the first two principal components, paired with the hard
# Fuzzy C-Means label of each observation.
pca_fcm_viz <- data.frame(
  PC1 = pca_result$x[, "PC1"],
  PC2 = pca_result$x[, "PC2"],
  Cluster = as.factor(fcm_cluster)
)

# Scatter plot of the Fuzzy C-Means partition in the PC1/PC2 plane
ggplot(data = pca_fcm_viz, mapping = aes(PC1, PC2, colour = Cluster)) +
  geom_point(size = 2, alpha = 0.6) +
  ggtitle("Fuzzy C-Means Clustering (k = 3) in PCA Space") +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

Visualisasi tingkat keanggotaan (cluster 1)

# Scores on the first two principal components together with each
# observation's membership degree in the first fuzzy cluster.
membership_df <- data.frame(
  PC1 = pca_result$x[, "PC1"],
  PC2 = pca_result$x[, "PC2"],
  Membership = fcm_res$membership[, 1]
)

# Continuous colour scale: blue for low membership, red for high membership
ggplot(data = membership_df, mapping = aes(PC1, PC2, colour = Membership)) +
  geom_point(size = 2) +
  scale_color_gradient(low = "blue", high = "red") +
  ggtitle("Membership Degree for Cluster 1") +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

Metrics Comparison

1. Silhouette Score

# Pairwise Euclidean distances between observations in the PCA space; shared
# by every silhouette computation below
dist_matrix <- dist(X_pca)

# --- K-Means ---
sil_kmeans <- silhouette(kmeans_res$cluster, dist_matrix)
avg_sil_kmeans <- mean(sil_kmeans[, 3])

# --- K-Medians ---
sil_kmed <- silhouette(kmed_cluster, dist_matrix)
avg_sil_kmed <- mean(sil_kmed[, 3])

# --- DBSCAN ---
# Noise points (label 0) are excluded; the silhouette is only defined when at
# least two real clusters remain.
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0

if (length(unique(db_cluster[db_valid])) > 1) {
  # A "dist" object is a plain vector and cannot be indexed with [i, j];
  # subset through the full matrix and convert back to a "dist" object.
  dist_valid <- as.dist(as.matrix(dist_matrix)[db_valid, db_valid])
  sil_db <- silhouette(db_cluster[db_valid], dist_valid)
  avg_sil_db <- mean(sil_db[, 3])
} else {
  avg_sil_db <- NA
}

# --- Mean Shift ---
sil_ms <- silhouette(ms_res_pca$assignment, dist_matrix)
avg_sil_ms <- mean(sil_ms[, 3])

# --- Fuzzy C-Means ---
sil_fcm <- silhouette(fcm_cluster, dist_matrix)
avg_sil_fcm <- mean(sil_fcm[, 3])

# Combine into one table
sil_summary <- data.frame(
  Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Silhouette_Score = c(
    avg_sil_kmeans,
    avg_sil_kmed,
    avg_sil_db,
    avg_sil_ms,
    avg_sil_fcm
  )
)

# Sort from best to worst (NA scores are placed last)
sil_summary <- sil_summary[order(-sil_summary$Silhouette_Score), ]

print(sil_summary)

##          Method Silhouette_Score
## 1       K-Means        0.1221781
## 5 Fuzzy C-Means        0.1160965
## 4    Mean Shift        0.0904467
## 2     K-Medians        0.0825339
## 3        DBSCAN               NA

# Horizontal bar chart of the average silhouette score per method, with bars
# ordered by score and the rounded value printed next to each bar.
ggplot(
  data = sil_summary,
  mapping = aes(
    x = reorder(Method, Silhouette_Score),
    y = Silhouette_Score,
    fill = Method
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = round(Silhouette_Score, 3)), hjust = -0.1) +
  coord_flip() +
  ggtitle("Perbandingan Silhouette Score Semua Metode") +
  xlab("Metode") +
  ylab("Average Silhouette Score") +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

2. Dunn Index (Higher is better)

df_matrix <- as.matrix(X_pca)

# Dunn index of a partition: smallest distance between two observations in
# different clusters divided by the largest distance between two observations
# in the same cluster (higher is better).
#
# This replaces clusterSim::index.DB(), which computes the Davies-Bouldin
# index (lower is better) and was therefore mislabelled and mis-ranked here.
#
# x:      numeric matrix, one row per observation
# labels: cluster label per row; any label values are accepted
# Returns NA when fewer than two clusters are present, and Inf when every
# cluster is a single point (largest within-cluster distance is 0).
dunn_index <- function(x, labels) {
  labels <- as.vector(labels)
  if (length(unique(labels)) < 2) {
    return(NA_real_)
  }
  d <- as.matrix(stats::dist(x))
  same_cluster <- outer(labels, labels, "==")
  min(d[!same_cluster]) / max(d[same_cluster])
}

# --- K-Means ---
dunn_kmeans <- dunn_index(df_matrix, kmeans_res$cluster)

# --- K-Medians ---
dunn_kmed <- dunn_index(df_matrix, kmed_cluster)

# --- DBSCAN ---
# Noise points (label 0) are excluded; the index needs at least two clusters.
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0

if (length(unique(db_cluster[db_valid])) > 1) {
  dunn_db <- dunn_index(df_matrix[db_valid, , drop = FALSE], db_cluster[db_valid])
} else {
  dunn_db <- NA
}

# --- Mean Shift ---
dunn_ms <- dunn_index(df_matrix, ms_res_pca$assignment)

# --- Fuzzy C-Means ---
dunn_fcm <- dunn_index(df_matrix, fcm_cluster)

## Warning in max(R[i, ][is.finite(R[i, ])]): no non-missing arguments to max;
## returning -Inf

# Combine the per-method values into one table
dunn_summary <- data.frame(
  Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Dunn_Index = c(
    dunn_kmeans,
    dunn_kmed,
    dunn_db,
    dunn_ms,
    dunn_fcm
  )
)

# Sort by Dunn_Index, largest value first (NA values are placed last)
dunn_summary <- dunn_summary[order(-dunn_summary$Dunn_Index), ]

print(dunn_summary)

##          Method Dunn_Index
## 2     K-Medians 3.44366080
## 5 Fuzzy C-Means 2.61553444
## 1       K-Means 2.58918077
## 4    Mean Shift 0.06002086
## 3        DBSCAN         NA

# Horizontal bar chart of the Dunn_Index column per method, with bars ordered
# by value and the rounded value printed next to each bar.
ggplot(
  data = dunn_summary,
  mapping = aes(
    x = reorder(Method, Dunn_Index),
    y = Dunn_Index,
    fill = Method
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = round(Dunn_Index, 3)), hjust = -0.1) +
  coord_flip() +
  ggtitle("Perbandingan Dunn Index Semua Metode") +
  xlab("Metode") +
  ylab("Dunn Index") +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

3. ARI Score Summary

# Collect the Adjusted Rand Index of every method into one table; the two
# vectors are aligned position by position.
ari_methods <- c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means")
ari_values <- c(ari_kmeans, ari_kmed, ari_dbscan, ari_ms, ari_fcm)
ari_summary <- data.frame(Method = ari_methods, ARI_Score = ari_values)

# Sort from best to worst
best_first <- order(-ari_summary$ARI_Score)
ari_summary <- ari_summary[best_first, ]

print(ari_summary)

##          Method   ARI_Score
## 2     K-Medians 0.213518389
## 5 Fuzzy C-Means 0.143394141
## 1       K-Means 0.129388875
## 4    Mean Shift 0.001442304
## 3        DBSCAN 0.000000000

# Horizontal bar chart of the ARI score per method, with bars ordered by score
# and the rounded value printed next to each bar.
ggplot(
  data = ari_summary,
  mapping = aes(
    x = reorder(Method, ARI_Score),
    y = ARI_Score,
    fill = Method
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = round(ARI_Score, 3)), hjust = -0.1) +
  coord_flip() +
  ggtitle("ARI Score Comparison") +
  xlab("Clustering Method") +
  ylab("Adjusted Rand Index") +
  theme_minimal()

Module 3 - Clustering with PCA

E. Adi Sforza Syahrul Ramadhan (24031554113)
Ahmad Dhani Alfawwas (24031554096)

2026-04-06

Package

Load Data

EDA

Sebaran data

Standarisasi

PCA

Hopkins Statistic (Original vs PCA)

Menentukan eps DBSCAN (PCA Data)

Clustering Model

K-Means

Menentukan jumlah cluster optimal (Elbow & Silhouette)

Visualisasi hasil clustering pada PCA (PC1 vs PC2)

K-Medians

Visualisasi clustering di PCA space

Evaluasi Silhouette Score

DBSCAN on PCA Data

Mean Shift

Fuzzy C-means

Visualisasi clustering di PCA space

Visualisasi tingkat keanggotaan (cluster 1)

Metrics Comparison

1. Silhouette Score

2. Dunn Index (Higher is better)

3. ARI Score Summary

Module 3 - Clustering with PCA

E. Adi Sforza Syahrul Ramadhan (24031554113) Ahmad Dhani Alfawwas (24031554096)

2026-04-06

Package

Load Data

EDA

Sebaran data

Standarisasi

PCA

Hopkins Statistic (Original vs PCA)

Menentukan eps DBSCAN (PCA Data)

Clustering Model

K-Means

Menentukan jumlah cluster optimal (Elbow & Silhouette)

Visualisasi hasil clustering pada PCA (PC1 vs PC2)

K-Medians

Visualisasi clustering di PCA space

Evaluasi Silhouette Score

DBSCAN on PCA Data

Mean Shift

Fuzzy C-means

Visualisasi clustering di PCA space

Visualisasi tingkat keanggotaan (cluster 1)

Metrics Comparison

1. Silhouette Score

2. Dunn Index (Higher is better)

3. ARI Score Summary

E. Adi Sforza Syahrul Ramadhan (24031554113)
Ahmad Dhani Alfawwas (24031554096)