E. Adi Sforza Syahrul Ramadhan
(24031554113)
Ahmad Dhani Alfawwas (24031554096)
packages <- c(
"readxl",
"dplyr",
"tidyr",
"ggplot2",
"factoextra",
"cluster",
"clusterSim",
"fclust",
"dbscan",
"meanShiftR",
"gridExtra",
"scales",
"ggpubr",
"mclust",
"moments",
"flexclust",
"e1071"
)
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Registered S3 method overwritten by 'fclust':
## method from
## print.fclust e1071
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
##
## bclust
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
## The following object is masked from 'package:ggplot2':
##
## element
## [[1]]
## [1] "readxl" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "dplyr" "readxl" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[3]]
## [1] "tidyr" "dplyr" "readxl" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "ggplot2" "tidyr" "dplyr" "readxl" "stats" "graphics"
## [7] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "factoextra" "ggplot2" "tidyr" "dplyr" "readxl"
## [6] "stats" "graphics" "grDevices" "utils" "datasets"
## [11] "methods" "base"
##
## [[6]]
## [1] "cluster" "factoextra" "ggplot2" "tidyr" "dplyr"
## [6] "readxl" "stats" "graphics" "grDevices" "utils"
## [11] "datasets" "methods" "base"
##
## [[7]]
## [1] "clusterSim" "MASS" "cluster" "factoextra" "ggplot2"
## [6] "tidyr" "dplyr" "readxl" "stats" "graphics"
## [11] "grDevices" "utils" "datasets" "methods" "base"
##
## [[8]]
## [1] "fclust" "clusterSim" "MASS" "cluster" "factoextra"
## [6] "ggplot2" "tidyr" "dplyr" "readxl" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[9]]
## [1] "dbscan" "fclust" "clusterSim" "MASS" "cluster"
## [6] "factoextra" "ggplot2" "tidyr" "dplyr" "readxl"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[10]]
## [1] "meanShiftR" "dbscan" "fclust" "clusterSim" "MASS"
## [6] "cluster" "factoextra" "ggplot2" "tidyr" "dplyr"
## [11] "readxl" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[11]]
## [1] "gridExtra" "meanShiftR" "dbscan" "fclust" "clusterSim"
## [6] "MASS" "cluster" "factoextra" "ggplot2" "tidyr"
## [11] "dplyr" "readxl" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[12]]
## [1] "scales" "gridExtra" "meanShiftR" "dbscan" "fclust"
## [6] "clusterSim" "MASS" "cluster" "factoextra" "ggplot2"
## [11] "tidyr" "dplyr" "readxl" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[13]]
## [1] "ggpubr" "scales" "gridExtra" "meanShiftR" "dbscan"
## [6] "fclust" "clusterSim" "MASS" "cluster" "factoextra"
## [11] "ggplot2" "tidyr" "dplyr" "readxl" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[14]]
## [1] "mclust" "ggpubr" "scales" "gridExtra" "meanShiftR"
## [6] "dbscan" "fclust" "clusterSim" "MASS" "cluster"
## [11] "factoextra" "ggplot2" "tidyr" "dplyr" "readxl"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[15]]
## [1] "moments" "mclust" "ggpubr" "scales" "gridExtra"
## [6] "meanShiftR" "dbscan" "fclust" "clusterSim" "MASS"
## [11] "cluster" "factoextra" "ggplot2" "tidyr" "dplyr"
## [16] "readxl" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[16]]
## [1] "flexclust" "moments" "mclust" "ggpubr" "scales"
## [6] "gridExtra" "meanShiftR" "dbscan" "fclust" "clusterSim"
## [11] "MASS" "cluster" "factoextra" "ggplot2" "tidyr"
## [16] "dplyr" "readxl" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[17]]
## [1] "e1071" "flexclust" "moments" "mclust" "ggpubr"
## [6] "scales" "gridExtra" "meanShiftR" "dbscan" "fclust"
## [11] "clusterSim" "MASS" "cluster" "factoextra" "ggplot2"
## [16] "tidyr" "dplyr" "readxl" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
data <- read.csv("Acoustic Features.csv", sep=",", header=TRUE)
X <- data[,c(2:51)]
# Convert class labels to numeric factors
data$Class_Numeric <- as.numeric(as.factor(data$Class))
# Show mapping
class_mapping <- data.frame(
Original_Class = levels(as.factor(data$Class)),
Numeric_Code = 1:length(levels(as.factor(data$Class)))
)
cat("Class Mapping (Numeric Encoding):\n")
## Class Mapping (Numeric Encoding):
print(class_mapping)
## Original_Class Numeric_Code
## 1 angry 1
## 2 happy 2
## 3 relax 3
## 4 sad 4
# View first few rows
cat("\nFirst 10 rows:\n")
##
## First 10 rows:
print(head(data[, c("Class", "Class_Numeric")], 10))
## Class Class_Numeric
## 1 relax 3
## 2 relax 3
## 3 relax 3
## 4 relax 3
## 5 relax 3
## 6 relax 3
## 7 relax 3
## 8 relax 3
## 9 relax 3
## 10 relax 3
tabel_deskriptif <- X %>%
summarise(across(everything(), list(
n = ~sum(!is.na(.)),
Mean = ~mean(., na.rm = TRUE),
Median = ~median(., na.rm = TRUE),
SD = ~sd(., na.rm = TRUE),
Min = ~min(., na.rm = TRUE),
Max = ~max(., na.rm = TRUE),
Skewness = ~moments::skewness(., na.rm = TRUE),
Kurtosis = ~moments::kurtosis(., na.rm = TRUE)
))) %>%
pivot_longer(
cols = everything(),
names_to = c("Variable", ".value"),
names_pattern = "(.*)_(.*)"
)
print(tabel_deskriptif, n = 50)
## # A tibble: 50 × 9
## Variable n Mean Median SD Min Max Skewness Kurtosis
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 X_RMSenerg… 400 1.35e-1 1.28e-1 6.44e-2 0.01 4.31e-1 0.703 3.60
## 2 X_Lowenerg… 400 5.54e-1 5.53e-1 5.07e-2 0.302 7.03e-1 -0.386 5.13
## 3 X_Fluctuat… 400 7.15e+0 6.73e+0 2.28e+0 3.58 2.35e+1 2.88 15.6
## 4 X_Tempo_Me… 400 1.24e+2 1.20e+2 3.42e+1 48.3 1.95e+2 0.116 2.37
## 5 X_MFCC_Mea… 400 2.46e+0 2.39e+0 7.99e-1 0.323 6.00e+0 0.865 4.91
## 6 X_MFCC_Mea… 400 7.19e-2 6.85e-2 5.38e-1 -3.48 1.94e+0 -0.677 7.83
## 7 X_MFCC_Mea… 400 4.88e-1 4.65e-1 2.95e-1 -0.87 1.62e+0 0.111 4.16
## 8 X_MFCC_Mea… 400 3.05e-2 4.45e-2 2.76e-1 -1.64 1.13e+0 -0.695 7.01
## 9 X_MFCC_Mea… 400 1.79e-1 1.81e-1 1.95e-1 -0.494 1.06e+0 0.0748 4.16
## 10 X_MFCC_Mea… 400 3.83e-2 4.95e-2 2.04e-1 -0.916 7.99e-1 -0.283 5.53
## 11 X_MFCC_Mea… 400 5.99e-2 7.2 e-2 1.81e-1 -0.936 5.71e-1 -0.790 6.30
## 12 X_MFCC_Mea… 400 4.35e-2 3.95e-2 1.65e-1 -0.744 7.28e-1 -0.107 5.81
## 13 X_MFCC_Mea… 400 2.30e-2 1.65e-2 1.59e-1 -0.621 5.39e-1 -0.129 4.26
## 14 X_MFCC_Mea… 400 2.78e-2 3.15e-2 1.52e-1 -0.544 5.1 e-1 -0.246 4.05
## 15 X_MFCC_Mea… 400 2.88e-2 3.7 e-2 1.36e-1 -0.487 4.94e-1 -0.297 3.88
## 16 X_MFCC_Mea… 400 1.67e-2 2.25e-2 1.29e-1 -0.418 3.55e-1 -0.350 3.38
## 17 X_MFCC_Mea… 400 2.41e-2 3.9 e-2 1.33e-1 -0.62 5.36e-1 -0.493 4.98
## 18 X_Roughnes… 400 5.28e+2 3.68e+2 5.21e+2 0.941 3.90e+3 2.08 9.63
## 19 X_Roughnes… 400 7.20e-2 6.8 e-2 1.74e-1 -0.525 5.84e-1 -0.0234 3.40
## 20 X_Zero.cro… 400 9.97e+2 8.93e+2 5.25e+2 149. 3.15e+3 0.923 3.66
## 21 X_AttackTi… 400 3.13e-2 2.7 e-2 1.68e-2 0.01 1.65e-1 3.37 19.9
## 22 X_AttackTi… 400 -2.89e-3 7.5 e-3 1.50e-1 -0.465 5.99e-1 -0.0114 3.45
## 23 X_Rolloff_… 400 5.69e+3 5.65e+3 2.29e+3 887. 1.15e+4 0.0959 2.38
## 24 X_Eventden… 400 2.78e+0 2.77e+0 1.33e+0 0.234 7.95e+0 0.474 3.12
## 25 X_Pulsecla… 400 2.49e-1 2.18e-1 1.55e-1 0.011 8.56e-1 1.16 4.18
## 26 X_Brightne… 400 4.34e-1 4.48e-1 1.32e-1 0.053 7.37e-1 -0.398 2.91
## 27 X_Spectral… 400 2.58e+3 2.55e+3 8.64e+2 607. 5.33e+3 0.224 2.85
## 28 X_Spectral… 400 3.08e+3 3.15e+3 7.68e+2 815. 4.72e+3 -0.259 2.56
## 29 X_Spectral… 400 1.87e+0 1.69e+0 8.82e-1 0.39 7.86e+0 2.27 12.0
## 30 X_Spectral… 400 7.35e+0 5.22e+0 8.62e+0 1.93 1.22e+2 7.94 91.5
## 31 X_Spectral… 400 4.85e-2 4.7 e-2 2.65e-2 0.006 2.09e-1 1.48 8.09
## 32 X_Entropyo… 400 8.73e-1 8.79e-1 3.73e-2 0.74 9.42e-1 -0.972 3.93
## 33 X_Chromagr… 400 3.53e-1 2.74e-1 3.23e-1 0 1 e+0 0.700 2.27
## 34 X_Chromagr… 400 2.53e-1 1.42e-1 2.88e-1 0 1 e+0 1.20 3.41
## 35 X_Chromagr… 400 3.65e-1 2.88e-1 3.25e-1 0 1 e+0 0.691 2.27
## 36 X_Chromagr… 400 2.08e-1 1.05e-1 2.54e-1 0 1 e+0 1.48 4.44
## 37 X_Chromagr… 400 3.50e-1 2.71e-1 3.04e-1 0 1 e+0 0.781 2.53
## 38 X_Chromagr… 400 2.64e-1 1.44e-1 2.93e-1 0 1 e+0 1.11 3.17
## 39 X_Chromagr… 400 2.43e-1 1.41e-1 2.76e-1 0 1 e+0 1.33 3.90
## 40 X_Chromagr… 400 3.92e-1 2.95e-1 3.31e-1 0 1 e+0 0.570 2.04
## 41 X_Chromagr… 400 3.55e-1 2.47e-1 3.35e-1 0 1 e+0 0.759 2.22
## 42 X_Chromagr… 400 5.91e-1 6.12e-1 3.58e-1 0 1 e+0 -0.218 1.56
## 43 X_Chromagr… 400 3.42e-1 2.47e-1 3.16e-1 0 1 e+0 0.711 2.30
## 44 X_Chromagr… 400 3.86e-1 2.96e-1 3.48e-1 0 1 e+0 0.546 1.88
## 45 X_Harmonic… 400 3.28e-1 3.33e-1 5.55e-2 0.112 4.88e-1 -0.464 3.47
## 46 X_Harmonic… 400 1.93e-1 1.9 e-1 4.71e-2 0.06 3.4 e-1 0.244 2.87
## 47 X_Harmonic… 400 -1.58e-4 -2 e-3 1.05e-1 -0.285 4.42e-1 0.200 3.90
## 48 X_Harmonic… 400 1.76e+0 1.68e+0 9.30e-1 0.187 4.49e+0 0.397 2.70
## 49 X_Harmonic… 400 7.70e-1 7.86e-1 7.21e-2 0.53 9.08e-1 -0.761 3.16
## 50 X_Harmonic… 400 9.67e-1 9.67e-1 3.84e-3 0.939 9.77e-1 -1.48 10.5
X_scaled <- scale(X)
pca_result <- prcomp(X_scaled, center = FALSE, scale. = FALSE)
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
n_pc_80 <- which(cumsum(var_explained) >= 0.80)
X_pca <- pca_result$x[, 1:n_pc_80]
## Warning in 1:n_pc_80: numerical expression has 29 elements: only the first used
cat("Original dimensions:", ncol(X_scaled), "\n")
## Original dimensions: 50
cat("PCA dimensions :", ncol(X_pca), "\n")
## PCA dimensions : 22
set.seed(42)
hop_original <- get_clust_tendency(X_scaled, n = 100, graph = FALSE)
## Warning: Hopkins statistic uses the corrected formula (Wright 2022); results
## differ from legacy factoextra. Set options(factoextra.warn_hopkins = FALSE) to
## silence this warning.
cat("Hopkins Statistic (Original):", round(hop_original$hopkins_stat, 4), "\n")
## Hopkins Statistic (Original): 1
hop_pca <- get_clust_tendency(X_pca, n = 100, graph = FALSE)
cat("Hopkins Statistic (PCA) :", round(hop_pca$hopkins_stat, 4), "\n")
## Hopkins Statistic (PCA) : 1
kNNdistplot(X_pca, k = 23)
abline(h = 8, col = "red", lty = 2)
set.seed(42)
# Elbow Method
fviz_nbclust(X_pca, kmeans, method = "wss") +
labs(title = "Elbow Method untuk Optimal k")
# Silhouette Method
fviz_nbclust(X_pca, kmeans, method = "silhouette") +
labs(title = "Silhouette Method untuk Optimal k")
# Berdasarkan hasil eksplorasi, dipilih k optimal = 2
k_optimal <- 2
set.seed(42)
kmeans_res <- kmeans(X_pca, centers = k_optimal, nstart = 25)
kmeans_res
## K-means clustering with 2 clusters of sizes 231, 169
##
## Cluster means:
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 1.936972 -0.05734675 -0.1078317 -0.04573734 -0.02410952 -0.07261830
## 2 -2.647577 0.07838521 0.1473912 0.06251672 0.03295443 0.09925933
## PC7 PC8 PC9 PC10 PC11 PC12
## 1 0.05040918 0.03321966 -0.01913624 -0.03111916 -0.03432697 -0.02435921
## 2 -0.06890249 -0.04540676 0.02615663 0.04253565 0.04692029 0.03329573
## PC13 PC14 PC15 PC16 PC17 PC18
## 1 -0.008972421 0.02160526 0.02161278 -0.03202798 -0.02217016 0.04579152
## 2 0.012264079 -0.02953144 -0.02954172 0.04377790 0.03030359 -0.06259078
## PC19 PC20 PC21 PC22
## 1 0.01398905 0.08423306 0.09094821 0.05117816
## 2 -0.01912113 -0.11513513 -0.12431382 -0.06995358
##
## Clustering vector:
## [1] 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 1 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1
## [38] 1 1 2 2 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 1 1 1 1 2
## [75] 1 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1
## [223] 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1
## [260] 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 2 2 1 2 2 2 1 2 1 1
## [297] 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 2 1 1 2
## [334] 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 2 2 2
## [371] 1 1 2 2 2 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 1 2 1 1 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 7309.459 6766.697
## (between_SS / total_SS = 12.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(Predicted = kmeans_res$cluster, Actual = data$Class)
## Actual
## Predicted angry happy relax sad
## 1 63 98 35 35
## 2 37 2 65 65
ari_kmeans <- adjustedRandIndex(kmeans_res$cluster, data$Class_Numeric)
cat("K-Means ARI Score (PCA):", round(ari_kmeans, 4), "\n")
## K-Means ARI Score (PCA): 0.1294
pca_kmeans_viz <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(kmeans_res$cluster)
)
ggplot(pca_kmeans_viz, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = paste("K-Means Clustering (k =", k_optimal, ") in PCA Space"),
x = "PC1", y = "PC2") +
theme_minimal()
df <- as.data.frame(X_pca)
set.seed(42)
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))
kmed_res
## kcca object of family 'kmedians'
##
## call:
## kcca(x = df, k = 3, family = kccaFamily("kmedians"))
##
## cluster sizes:
##
## 1 2 3
## 135 104 161
kmed_cluster <- as.vector(clusters(kmed_res))
table(kmed_cluster)
## kmed_cluster
## 1 2 3
## 135 104 161
table(Predicted = kmed_cluster, Actual = data$Class)
## Actual
## Predicted angry happy relax sad
## 1 53 9 29 44
## 2 6 0 59 39
## 3 41 91 12 17
ari_kmed <- adjustedRandIndex(kmed_cluster, data$Class_Numeric)
cat("K-Medians ARI Score (PCA):", round(ari_kmed, 4), "\n")
## K-Medians ARI Score (PCA): 0.2135
pca_kmed_viz <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(kmed_cluster)
)
ggplot(pca_kmed_viz, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "K-Medians Clustering (k = 3) in PCA Space",
x = "PC1", y = "PC2") +
theme_minimal()
sil_kmed <- silhouette(kmed_cluster, dist(df))
avg_sil_kmed <- mean(sil_kmed[, 3])
cat("Average Silhouette Score (K-Medians):", round(avg_sil_kmed, 4), "\n")
## Average Silhouette Score (K-Medians): 0.0825
set.seed(42)
db_res <- dbscan(X_pca, eps = 12, minPts = 23)
db_res
## DBSCAN clustering for 400 objects.
## Parameters: eps = 12, minPts = 23
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 1 noise points.
##
## 0 1
## 1 399
##
## Available fields: cluster, eps, minPts, metric, borderPoints
db_res$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
table(Predicted = db_res$cluster, Actual = data$Class)
## Actual
## Predicted angry happy relax sad
## 0 0 0 0 1
## 1 100 100 100 99
ari_dbscan <- adjustedRandIndex(db_res$cluster, data$Class_Numeric)
ari_dbscan
## [1] 0
DBSCAN gagal melakukan clustering pada 4 kelas. DBSCAN hanya menangkap 1 cluster. Ini terjadi karena nilai yang berdekatan yang dapat dilihat pada gambar dibawah ini,
pca_dbscan_viz <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(db_res$cluster)
)
ggplot(pca_dbscan_viz, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "DBSCAN Results in PCA Space (0 = Noise)",
x = "PC1", y = "PC2") +
theme_minimal()
set.seed(42)
ms_res_pca <- meanShift(X_pca)
cat("Number of clusters found:", length(unique(ms_res_pca$assignment)), "\n")
## Number of clusters found: 381
# Visualize Mean Shift results in PCA space
pca_ms_viz <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(ms_res_pca$assignment)
)
ggplot(pca_ms_viz, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "Mean Shift Results in PCA Space",
x = "PC1", y = "PC2") +
theme_minimal() +
theme(legend.position = "none")
ari_ms <- adjustedRandIndex(ms_res_pca$assignment, data$Class)
cat("Mean Shift ARI Score (PCA):", round(ari_ms, 4), "\n")
## Mean Shift ARI Score (PCA): 0.0014
# --- Fuzzy C-Means ---
df <- as.data.frame(X_pca)
set.seed(42)
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 adalah parameter fuzziness
fcm_cluster <- apply(fcm_res$membership, 1, which.max)
table(fcm_cluster)
## fcm_cluster
## 1 3
## 186 214
table(Predicted = fcm_cluster, Actual = data$Class)
## Actual
## Predicted angry happy relax sad
## 1 42 4 72 68
## 3 58 96 28 32
ari_fcm <- adjustedRandIndex(fcm_cluster, data$Class_Numeric)
cat("Fuzzy C-Means ARI Score (PCA):", round(ari_fcm, 4), "\n")
## Fuzzy C-Means ARI Score (PCA): 0.1434
pca_fcm_viz <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Cluster = as.factor(fcm_cluster)
)
ggplot(pca_fcm_viz, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
labs(title = "Fuzzy C-Means Clustering (k = 3) in PCA Space",
x = "PC1", y = "PC2") +
theme_minimal()
membership_df <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
Membership = fcm_res$membership[, 1]
)
ggplot(membership_df, aes(x = PC1, y = PC2, color = Membership)) +
geom_point(size = 2) +
scale_color_gradient(low = "blue", high = "red") +
labs(title = "Membership Degree for Cluster 1",
x = "PC1", y = "PC2") +
theme_minimal()
# Hitung distance matrix (pakai PCA data)
dist_matrix <- dist(X_pca)
# --- K-Means ---
sil_kmeans <- silhouette(kmeans_res$cluster, dist_matrix)
avg_sil_kmeans <- mean(sil_kmeans[, 3])
# --- K-Medians ---
sil_kmed <- silhouette(kmed_cluster, dist_matrix)
avg_sil_kmed <- mean(sil_kmed[, 3])
# --- DBSCAN ---
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0
if(length(unique(db_cluster[db_valid])) > 1){
sil_db <- silhouette(db_cluster[db_valid], dist_matrix[db_valid, db_valid])
avg_sil_db <- mean(sil_db[, 3])
} else {
avg_sil_db <- NA
}
# --- Mean Shift ---
sil_ms <- silhouette(ms_res_pca$assignment, dist_matrix)
avg_sil_ms <- mean(sil_ms[, 3])
# --- Fuzzy C-Means ---
sil_fcm <- silhouette(fcm_cluster, dist_matrix)
avg_sil_fcm <- mean(sil_fcm[, 3])
# Gabungkan ke tabel
sil_summary <- data.frame(
Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
Silhouette_Score = c(
avg_sil_kmeans,
avg_sil_kmed,
avg_sil_db,
avg_sil_ms,
avg_sil_fcm
)
)
# Urutkan dari terbaik
sil_summary <- sil_summary[order(-sil_summary$Silhouette_Score), ]
print(sil_summary)
## Method Silhouette_Score
## 1 K-Means 0.1221781
## 5 Fuzzy C-Means 0.1160965
## 4 Mean Shift 0.0904467
## 2 K-Medians 0.0825339
## 3 DBSCAN NA
ggplot(sil_summary, aes(x = reorder(Method, Silhouette_Score), y = Silhouette_Score, fill = Method)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "Perbandingan Silhouette Score Semua Metode",
x = "Metode",
y = "Average Silhouette Score") +
theme_minimal() +
geom_text(aes(label = round(Silhouette_Score, 3)), hjust = -0.1)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
df_matrix <- as.matrix(X_pca)
# --- K-Means ---
dunn_kmeans <- index.DB(df_matrix, kmeans_res$cluster)$DB
# --- K-Medians ---
dunn_kmed <- index.DB(df_matrix, kmed_cluster)$DB
# --- DBSCAN ---
db_cluster <- db_res$cluster
db_valid <- db_cluster != 0
if(length(unique(db_cluster[db_valid])) > 1){
dunn_db <- index.DB(df_matrix[db_valid, ], db_cluster[db_valid])$DB
} else {
dunn_db <- NA
}
# --- Mean Shift ---
dunn_ms <- index.DB(df_matrix, ms_res_pca$assignment)$DB
# --- Fuzzy C-Means ---
dunn_fcm <- index.DB(df_matrix, fcm_cluster)$DB
## Warning in max(R[i, ][is.finite(R[i, ])]): no non-missing arguments to max;
## returning -Inf
# Gabungkan ke tabel
dunn_summary <- data.frame(
Method = c("K-Means", "K-Medians", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
Dunn_Index = c(
dunn_kmeans,
dunn_kmed,
dunn_db,
dunn_ms,
dunn_fcm
)
)
# Urutkan dari terbaik (higher is better)
dunn_summary <- dunn_summary[order(-dunn_summary$Dunn_Index), ]
print(dunn_summary)
## Method Dunn_Index
## 2 K-Medians 3.44366080
## 5 Fuzzy C-Means 2.61553444
## 1 K-Means 2.58918077
## 4 Mean Shift 0.06002086
## 3 DBSCAN NA
ggplot(dunn_summary, aes(x = reorder(Method, Dunn_Index), y = Dunn_Index, fill = Method)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "Perbandingan Dunn Index Semua Metode",
x = "Metode",
y = "Dunn Index") +
theme_minimal() +
geom_text(aes(label = round(Dunn_Index, 3)), hjust = -0.1)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
ari_summary <- data.frame(
Method = c(
"K-Means",
"K-Medians",
"DBSCAN",
"Mean Shift",
"Fuzzy C-Means"
),
ARI_Score = c(
ari_kmeans,
ari_kmed,
ari_dbscan,
ari_ms,
ari_fcm
)
)
# Urutkan dari terbaik
ari_summary <- ari_summary[order(-ari_summary$ARI_Score), ]
print(ari_summary)
## Method ARI_Score
## 2 K-Medians 0.213518389
## 5 Fuzzy C-Means 0.143394141
## 1 K-Means 0.129388875
## 4 Mean Shift 0.001442304
## 3 DBSCAN 0.000000000
# Visualize ARI scores
ggplot(ari_summary, aes(x = reorder(Method, ARI_Score), y = ARI_Score, fill = Method)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "ARI Score Comparison",
x = "Clustering Method",
y = "Adjusted Rand Index") +
theme_minimal() +
geom_text(aes(label = round(ARI_Score, 3)), hjust = -0.1)