Library
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.4.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(LPCM)
## Warning: package 'LPCM' was built under R version 4.4.3
##
## Attaching package: 'LPCM'
##
## The following object is masked from 'package:lubridate':
##
## ms
library(ggplot2)
library(cluster)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
Dataset
spotify <- read.csv("dataset.csv")
str(spotify)
## 'data.frame': 114000 obs. of 21 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ track_id : chr "5SuOikwiRyPMVoIQDJUgSV" "4qPNDBW1i3p13qLCt0Ki3A" "1iJBSr7s7jYXzM8EGcbK5b" "6lfxq3CG4xtTiEg7opyCyx" ...
## $ artists : chr "Gen Hoshino" "Ben Woodward" "Ingrid Michaelson;ZAYN" "Kina Grannis" ...
## $ album_name : chr "Comedy" "Ghost (Acoustic)" "To Begin Again" "Crazy Rich Asians (Original Motion Picture Soundtrack)" ...
## $ track_name : chr "Comedy" "Ghost - Acoustic" "To Begin Again" "Can't Help Falling In Love" ...
## $ popularity : int 73 55 57 71 82 58 74 80 74 56 ...
## $ duration_ms : int 230666 149610 210826 201933 198853 214240 229400 242946 189613 205594 ...
## $ explicit : chr "False" "False" "False" "False" ...
## $ danceability : num 0.676 0.42 0.438 0.266 0.618 0.688 0.407 0.703 0.625 0.442 ...
## $ energy : num 0.461 0.166 0.359 0.0596 0.443 0.481 0.147 0.444 0.414 0.632 ...
## $ key : int 1 1 0 0 2 6 2 11 0 1 ...
## $ loudness : num -6.75 -17.23 -9.73 -18.52 -9.68 ...
## $ mode : int 0 1 1 1 1 1 1 1 1 1 ...
## $ speechiness : num 0.143 0.0763 0.0557 0.0363 0.0526 0.105 0.0355 0.0417 0.0369 0.0295 ...
## $ acousticness : num 0.0322 0.924 0.21 0.905 0.469 0.289 0.857 0.559 0.294 0.426 ...
## $ instrumentalness: num 1.01e-06 5.56e-06 0.00 7.07e-05 0.00 0.00 2.89e-06 0.00 0.00 4.19e-03 ...
## $ liveness : num 0.358 0.101 0.117 0.132 0.0829 0.189 0.0913 0.0973 0.151 0.0735 ...
## $ valence : num 0.715 0.267 0.12 0.143 0.167 0.666 0.0765 0.712 0.669 0.196 ...
## $ tempo : num 87.9 77.5 76.3 181.7 119.9 ...
## $ time_signature : int 4 4 4 3 4 4 3 4 4 4 ...
## $ track_genre : chr "acoustic" "acoustic" "acoustic" "acoustic" ...
sum(is.na(spotify))
## [1] 0
Seleksi fitur numerik untuk clustering
features <- spotify %>%
select(danceability, energy, acousticness, instrumentalness,
liveness, valence, tempo, speechiness)
Bersihkan data
features <- na.omit(features)
Subset 5000 data acak
set.seed(42)
subset_idx <- sample(1:nrow(features), 5000)
features_subset <- features[subset_idx, ]
Standarisasi
scaled_data <- scale(features_subset)
z_scores <- scale(scaled_data)
outliers <- apply(z_scores, 1, function(x) any(abs(x) > 3))
scaled_data <- scaled_data[!outliers, ]
PCA untuk reduksi dimensi
pca <- prcomp(scaled_data)
pca_data <- as.data.frame(pca$x[, 1:4])
colnames(pca_data) <- paste("PC", 1:4, sep = "")
head(pca_data)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4730 1.1913 0.9668 0.8921 0.76272 0.68075 0.59148
## Proportion of Variance 0.3134 0.2050 0.1350 0.1149 0.08403 0.06694 0.05053
## Cumulative Proportion 0.3134 0.5184 0.6534 0.7683 0.85238 0.91932 0.96985
## PC8
## Standard deviation 0.45687
## Proportion of Variance 0.03015
## Cumulative Proportion 1.00000
pca$rotation
## PC1 PC2 PC3 PC4 PC5
## danceability 0.34837808 -0.49503303 -0.35086478 0.307824982 -0.17736740
## energy 0.55930356 0.34887243 -0.12660032 -0.135152673 0.10871890
## acousticness -0.50468653 -0.40491772 0.28815317 0.144129406 0.21048681
## instrumentalness -0.30019664 0.34575281 -0.58456016 0.580542525 0.32310324
## liveness 0.07000019 0.10059726 0.15371951 -0.234411998 0.79245721
## valence 0.41316158 -0.47452825 0.08492435 0.277356667 0.40583848
## tempo 0.18614003 0.33698418 0.63648635 0.630315033 -0.11958676
## speechiness 0.10361036 0.02952551 -0.01291209 -0.001371597 0.03203607
## PC6 PC7 PC8
## danceability -0.5753496617 -0.16232312 -0.1639434966
## energy 0.1702888530 0.12692895 -0.6886542133
## acousticness -0.0004927042 0.20547778 -0.6253011713
## instrumentalness 0.0799140990 0.02853030 0.0009052951
## liveness -0.4768987560 -0.20483748 0.0949854991
## valence 0.5424988597 0.07115788 0.2364492699
## tempo -0.1726911467 -0.07224225 0.0062829498
## speechiness -0.2864058292 0.92858191 0.2071365547
rotation <- pca$rotation[, 1:4]
round(pca$rotation[, 1:4], 4)
## PC1 PC2 PC3 PC4
## danceability 0.3484 -0.4950 -0.3509 0.3078
## energy 0.5593 0.3489 -0.1266 -0.1352
## acousticness -0.5047 -0.4049 0.2882 0.1441
## instrumentalness -0.3002 0.3458 -0.5846 0.5805
## liveness 0.0700 0.1006 0.1537 -0.2344
## valence 0.4132 -0.4745 0.0849 0.2774
## tempo 0.1861 0.3370 0.6365 0.6303
## speechiness 0.1036 0.0295 -0.0129 -0.0014
fviz_pca_biplot(pca,
geom.ind = "point",
label = "var",
repel = TRUE,
col.var = "blue",
title = "PCA Biplot - Eksplorasi Awal (Sebelum Clustering)")
k_range <- 1:10
inertia_values <- numeric(length(k_range))
for (k in k_range) {
kmeans_res <- kmeans(scaled_data, centers = k)
inertia_values[k] <- kmeans_res$tot.withinss
}
plot(k_range, inertia_values, type = "b", pch = 19, col = "blue",
xlab = "Jumlah Klaster (k)", ylab = "Inertia",
main = "Elbow Method for Optimal K")
KMeans Clustering
set.seed(123)
kmeans_res <- kmeans(scaled_data, centers = 5, nstart = 25)
## Warning: did not converge in 10 iterations
pca_data$kmeans <- as.factor(kmeans_res$cluster)
DBSCAN Clustering
dbscan_res <- dbscan(scaled_data, eps = 2.4, minPts = 4)
pca_data$dbscan <- as.factor(dbscan_res$cluster)
MeanShift Clustering (pakai LPCM)
meanshift_res <- ms(scaled_data, h = 0.3)
pca_data$meanshift <- as.factor(meanshift_res$cluster.label)
plot_cluster <- function(df, cluster_col, title) {
ggplot(df, aes_string(x = "PC1", y = "PC2", color = cluster_col)) +
geom_point(size = 2, alpha = 0.7) +
labs(title = title, color = "Cluster") +
theme_minimal() +
scale_color_brewer(palette = "Set1")
}
Visualisasi hasil
plot_cluster(pca_data, "kmeans", "KMeans Clustering")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_cluster(pca_data, "dbscan", "DBSCAN Clustering")
plot_cluster(pca_data, "meanshift", "MeanShift Clustering")
# Biplot dengan warna berdasarkan hasil clustering (misal KMeans)
fviz_pca_biplot(pca,
geom.ind = "point",
habillage = pca_data$kmeans, # Bisa diganti cluster lain
addEllipses = TRUE,
label = "var",
repel = TRUE,
col.var = "black",
palette = "Set1",
title = "PCA Biplot dengan Hasil Clustering (KMeans)")
fviz_pca_biplot(pca,
geom.ind = "point",
habillage = pca_data$dbscan, # warna berdasarkan cluster DBSCAN
addEllipses = TRUE,
label = "var",
repel = TRUE,
col.var = "black",
palette = "Set1",
title = "PCA Biplot dengan Hasil Clustering (DBSCAN)")
## Too few points to calculate an ellipse
sil_kmeans <- silhouette(as.numeric(pca_data$kmeans), dist(scaled_data))
mean_sil_kmeans <- mean(sil_kmeans[, 3])
cat("Silhouette Score - KMeans:", mean_sil_kmeans, "\n")
## Silhouette Score - KMeans: 0.204077
if (length(unique(pca_data$dbscan)) > 1) {
sil_dbscan <- silhouette(as.numeric(pca_data$dbscan), dist(scaled_data))
mean_sil_dbscan <- mean(sil_dbscan[, 3])
cat("Silhouette Score - DBSCAN:", mean_sil_dbscan, "\n")
} else {
cat("DBSCAN hanya menghasilkan satu cluster atau noise semua\n")
}
## Silhouette Score - DBSCAN: 0.3329218
if (length(unique(pca_data$meanshift)) > 1) {
sil_meanshift <- silhouette(as.numeric(pca_data$meanshift), dist(scaled_data))
mean_sil_meanshift <- mean(sil_meanshift[, 3])
cat("Silhouette Score - MeanShift:", mean_sil_meanshift, "\n")
} else {
cat("MeanShift hanya menghasilkan satu cluster\n")
}
## Silhouette Score - MeanShift: 0.3343254
Menghitung statistik deskriptif untuk KMeans
kmeans_stats <- pca_data %>%
select_if(is.numeric) %>%
bind_cols(kmeans = pca_data$kmeans) %>%
group_by(kmeans) %>%
summarise(across(everything(), list(mean = ~mean(.),
sd = ~sd(.),
min = ~min(.),
max = ~max(.)),
.names = "{col}_{fn}"))
print(kmeans_stats)
## # A tibble: 5 × 17
## kmeans PC1_mean PC1_sd PC1_min PC1_max PC2_mean PC2_sd PC2_min PC2_max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.195 0.747 -2.31 1.60 1.45 1.04 -1.01 4.38
## 2 2 -3.39 0.928 -5.13 -1.23 0.360 0.914 -1.80 2.69
## 3 3 -1.40 0.806 -3.89 0.103 -0.795 0.734 -2.87 1.41
## 4 4 0.648 0.646 -1.86 2.33 0.999 0.680 -0.190 4.03
## 5 5 1.06 0.650 -0.846 2.65 -0.818 0.624 -3.22 0.667
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## # PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>
Statistik deskriptif untuk DBSCAN
dbscan_stats <- pca_data %>%
select_if(is.numeric) %>%
bind_cols(dbscan = pca_data$dbscan) %>%
group_by(dbscan) %>%
summarise(across(everything(), list(mean = ~mean(.),
sd = ~sd(.),
min = ~min(.),
max = ~max(.)),
.names = "{col}_{fn}"))
print(dbscan_stats)
## # A tibble: 2 × 17
## dbscan PC1_mean PC1_sd PC1_min PC1_max PC2_mean PC2_sd PC2_min PC2_max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -2.41 1.52 -4.13 -1.23 0.592 1.00 -0.0438 1.75
## 2 1 0.00152 1.47 -5.13 2.65 -0.000373 1.19 -3.22 4.38
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## # PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>
Statistik deskriptif untuk MeanShift
meanshift_stats <- pca_data %>%
select_if(is.numeric) %>%
bind_cols(meanshift = pca_data$meanshift) %>%
group_by(meanshift) %>%
summarise(across(everything(), list(mean = ~mean(.),
sd = ~sd(.),
min = ~min(.),
max = ~max(.)),
.names = "{col}_{fn}"))
print(meanshift_stats)
## # A tibble: 2 × 17
## meanshift PC1_mean PC1_sd PC1_min PC1_max PC2_mean PC2_sd PC2_min PC2_max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.164 1.26 -4.56 2.65 -0.0196 1.20 -3.22 4.38
## 2 2 -3.91 0.605 -5.13 -2.44 0.467 0.771 -1.29 2.57
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## # PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>