Library

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.4.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(LPCM)
## Warning: package 'LPCM' was built under R version 4.4.3
## 
## Attaching package: 'LPCM'
## 
## The following object is masked from 'package:lubridate':
## 
##     ms
library(ggplot2)
library(cluster)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

Dataset

spotify <- read.csv("dataset.csv")
str(spotify)
## 'data.frame':    114000 obs. of  21 variables:
##  $ X               : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ track_id        : chr  "5SuOikwiRyPMVoIQDJUgSV" "4qPNDBW1i3p13qLCt0Ki3A" "1iJBSr7s7jYXzM8EGcbK5b" "6lfxq3CG4xtTiEg7opyCyx" ...
##  $ artists         : chr  "Gen Hoshino" "Ben Woodward" "Ingrid Michaelson;ZAYN" "Kina Grannis" ...
##  $ album_name      : chr  "Comedy" "Ghost (Acoustic)" "To Begin Again" "Crazy Rich Asians (Original Motion Picture Soundtrack)" ...
##  $ track_name      : chr  "Comedy" "Ghost - Acoustic" "To Begin Again" "Can't Help Falling In Love" ...
##  $ popularity      : int  73 55 57 71 82 58 74 80 74 56 ...
##  $ duration_ms     : int  230666 149610 210826 201933 198853 214240 229400 242946 189613 205594 ...
##  $ explicit        : chr  "False" "False" "False" "False" ...
##  $ danceability    : num  0.676 0.42 0.438 0.266 0.618 0.688 0.407 0.703 0.625 0.442 ...
##  $ energy          : num  0.461 0.166 0.359 0.0596 0.443 0.481 0.147 0.444 0.414 0.632 ...
##  $ key             : int  1 1 0 0 2 6 2 11 0 1 ...
##  $ loudness        : num  -6.75 -17.23 -9.73 -18.52 -9.68 ...
##  $ mode            : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ speechiness     : num  0.143 0.0763 0.0557 0.0363 0.0526 0.105 0.0355 0.0417 0.0369 0.0295 ...
##  $ acousticness    : num  0.0322 0.924 0.21 0.905 0.469 0.289 0.857 0.559 0.294 0.426 ...
##  $ instrumentalness: num  1.01e-06 5.56e-06 0.00 7.07e-05 0.00 0.00 2.89e-06 0.00 0.00 4.19e-03 ...
##  $ liveness        : num  0.358 0.101 0.117 0.132 0.0829 0.189 0.0913 0.0973 0.151 0.0735 ...
##  $ valence         : num  0.715 0.267 0.12 0.143 0.167 0.666 0.0765 0.712 0.669 0.196 ...
##  $ tempo           : num  87.9 77.5 76.3 181.7 119.9 ...
##  $ time_signature  : int  4 4 4 3 4 4 3 4 4 4 ...
##  $ track_genre     : chr  "acoustic" "acoustic" "acoustic" "acoustic" ...
sum(is.na(spotify))
## [1] 0

Seleksi fitur numerik untuk clustering

features <- spotify %>%
  select(danceability, energy, acousticness, instrumentalness,
         liveness, valence, tempo, speechiness)

Bersihkan data

features <- na.omit(features)

Subset 5000 data acak

set.seed(42)
subset_idx <- sample(1:nrow(features), 5000)
features_subset <- features[subset_idx, ]

Standarisasi

scaled_data <- scale(features_subset)
z_scores <- scale(scaled_data)
outliers <- apply(z_scores, 1, function(x) any(abs(x) > 3))
scaled_data <- scaled_data[!outliers, ]

PCA untuk reduksi dimensi

pca <- prcomp(scaled_data)
pca_data <- as.data.frame(pca$x[, 1:4])
colnames(pca_data) <- paste("PC", 1:4, sep = "")


head(pca_data)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.4730 1.1913 0.9668 0.8921 0.76272 0.68075 0.59148
## Proportion of Variance 0.3134 0.2050 0.1350 0.1149 0.08403 0.06694 0.05053
## Cumulative Proportion  0.3134 0.5184 0.6534 0.7683 0.85238 0.91932 0.96985
##                            PC8
## Standard deviation     0.45687
## Proportion of Variance 0.03015
## Cumulative Proportion  1.00000
pca$rotation
##                          PC1         PC2         PC3          PC4         PC5
## danceability      0.34837808 -0.49503303 -0.35086478  0.307824982 -0.17736740
## energy            0.55930356  0.34887243 -0.12660032 -0.135152673  0.10871890
## acousticness     -0.50468653 -0.40491772  0.28815317  0.144129406  0.21048681
## instrumentalness -0.30019664  0.34575281 -0.58456016  0.580542525  0.32310324
## liveness          0.07000019  0.10059726  0.15371951 -0.234411998  0.79245721
## valence           0.41316158 -0.47452825  0.08492435  0.277356667  0.40583848
## tempo             0.18614003  0.33698418  0.63648635  0.630315033 -0.11958676
## speechiness       0.10361036  0.02952551 -0.01291209 -0.001371597  0.03203607
##                            PC6         PC7           PC8
## danceability     -0.5753496617 -0.16232312 -0.1639434966
## energy            0.1702888530  0.12692895 -0.6886542133
## acousticness     -0.0004927042  0.20547778 -0.6253011713
## instrumentalness  0.0799140990  0.02853030  0.0009052951
## liveness         -0.4768987560 -0.20483748  0.0949854991
## valence           0.5424988597  0.07115788  0.2364492699
## tempo            -0.1726911467 -0.07224225  0.0062829498
## speechiness      -0.2864058292  0.92858191  0.2071365547
rotation <- pca$rotation[, 1:4]
round(pca$rotation[, 1:4], 4)
##                      PC1     PC2     PC3     PC4
## danceability      0.3484 -0.4950 -0.3509  0.3078
## energy            0.5593  0.3489 -0.1266 -0.1352
## acousticness     -0.5047 -0.4049  0.2882  0.1441
## instrumentalness -0.3002  0.3458 -0.5846  0.5805
## liveness          0.0700  0.1006  0.1537 -0.2344
## valence           0.4132 -0.4745  0.0849  0.2774
## tempo             0.1861  0.3370  0.6365  0.6303
## speechiness       0.1036  0.0295 -0.0129 -0.0014
fviz_pca_biplot(pca, 
                geom.ind = "point",
                label = "var",
                repel = TRUE,
                col.var = "blue",
                title = "PCA Biplot - Eksplorasi Awal (Sebelum Clustering)")

k_range <- 1:10
inertia_values <- numeric(length(k_range))
for (k in k_range) {
  kmeans_res <- kmeans(scaled_data, centers = k)
  inertia_values[k] <- kmeans_res$tot.withinss
}

plot(k_range, inertia_values, type = "b", pch = 19, col = "blue", 
     xlab = "Jumlah Klaster (k)", ylab = "Inertia",
     main = "Elbow Method for Optimal K")

KMeans Clustering

set.seed(123)
kmeans_res <- kmeans(scaled_data, centers = 5, nstart = 25)
## Warning: did not converge in 10 iterations
pca_data$kmeans <- as.factor(kmeans_res$cluster)

DBSCAN Clustering

dbscan_res <- dbscan(scaled_data, eps = 2.4, minPts = 4)
pca_data$dbscan <- as.factor(dbscan_res$cluster)

MeanShift Clustering (pakai LPCM)

meanshift_res <- ms(scaled_data, h = 0.3) 

pca_data$meanshift <- as.factor(meanshift_res$cluster.label)
plot_cluster <- function(df, cluster_col, title) {
  ggplot(df, aes_string(x = "PC1", y = "PC2", color = cluster_col)) +
    geom_point(size = 2, alpha = 0.7) +
    labs(title = title, color = "Cluster") +
    theme_minimal() +
    scale_color_brewer(palette = "Set1")
}

Visualisasi hasil

plot_cluster(pca_data, "kmeans", "KMeans Clustering")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot_cluster(pca_data, "dbscan", "DBSCAN Clustering")

plot_cluster(pca_data, "meanshift", "MeanShift Clustering")

# Biplot dengan warna berdasarkan hasil clustering (misal KMeans)
fviz_pca_biplot(pca, 
                geom.ind = "point",
                habillage = pca_data$kmeans, # Bisa diganti cluster lain
                addEllipses = TRUE,
                label = "var",
                repel = TRUE,
                col.var = "black",
                palette = "Set1",
                title = "PCA Biplot dengan Hasil Clustering (KMeans)")

fviz_pca_biplot(pca, 
                geom.ind = "point",
                habillage = pca_data$dbscan,  # warna berdasarkan cluster DBSCAN
                addEllipses = TRUE,
                label = "var",
                repel = TRUE,
                col.var = "black",
                palette = "Set1",
                title = "PCA Biplot dengan Hasil Clustering (DBSCAN)")
## Too few points to calculate an ellipse

sil_kmeans <- silhouette(as.numeric(pca_data$kmeans), dist(scaled_data))
mean_sil_kmeans <- mean(sil_kmeans[, 3])
cat("Silhouette Score - KMeans:", mean_sil_kmeans, "\n")
## Silhouette Score - KMeans: 0.204077
if (length(unique(pca_data$dbscan)) > 1) {
  sil_dbscan <- silhouette(as.numeric(pca_data$dbscan), dist(scaled_data))
  mean_sil_dbscan <- mean(sil_dbscan[, 3])
  cat("Silhouette Score - DBSCAN:", mean_sil_dbscan, "\n")
} else {
  cat("DBSCAN hanya menghasilkan satu cluster atau noise semua\n")
}
## Silhouette Score - DBSCAN: 0.3329218
if (length(unique(pca_data$meanshift)) > 1) {
  sil_meanshift <- silhouette(as.numeric(pca_data$meanshift), dist(scaled_data))
  mean_sil_meanshift <- mean(sil_meanshift[, 3])
  cat("Silhouette Score - MeanShift:", mean_sil_meanshift, "\n")
} else {
  cat("MeanShift hanya menghasilkan satu cluster\n")
}
## Silhouette Score - MeanShift: 0.3343254

Menghitung statistik deskriptif untuk KMeans

kmeans_stats <- pca_data %>%
  select_if(is.numeric) %>%  
  bind_cols(kmeans = pca_data$kmeans) %>%  
  group_by(kmeans) %>%
  summarise(across(everything(), list(mean = ~mean(.), 
                                      sd = ~sd(.), 
                                      min = ~min(.), 
                                      max = ~max(.)),
                   .names = "{col}_{fn}"))
print(kmeans_stats)
## # A tibble: 5 × 17
##   kmeans PC1_mean PC1_sd PC1_min PC1_max PC2_mean PC2_sd PC2_min PC2_max
##   <fct>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>  <dbl>   <dbl>   <dbl>
## 1 1        -0.195  0.747  -2.31    1.60     1.45   1.04   -1.01    4.38 
## 2 2        -3.39   0.928  -5.13   -1.23     0.360  0.914  -1.80    2.69 
## 3 3        -1.40   0.806  -3.89    0.103   -0.795  0.734  -2.87    1.41 
## 4 4         0.648  0.646  -1.86    2.33     0.999  0.680  -0.190   4.03 
## 5 5         1.06   0.650  -0.846   2.65    -0.818  0.624  -3.22    0.667
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## #   PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>

Statistik deskriptif untuk DBSCAN

dbscan_stats <- pca_data %>%
  select_if(is.numeric) %>% 
  bind_cols(dbscan = pca_data$dbscan) %>%
  group_by(dbscan) %>%
  summarise(across(everything(), list(mean = ~mean(.), 
                                      sd = ~sd(.), 
                                      min = ~min(.), 
                                      max = ~max(.)),
                   .names = "{col}_{fn}"))
print(dbscan_stats)
## # A tibble: 2 × 17
##   dbscan PC1_mean PC1_sd PC1_min PC1_max  PC2_mean PC2_sd PC2_min PC2_max
##   <fct>     <dbl>  <dbl>   <dbl>   <dbl>     <dbl>  <dbl>   <dbl>   <dbl>
## 1 0      -2.41      1.52   -4.13   -1.23  0.592      1.00 -0.0438    1.75
## 2 1       0.00152   1.47   -5.13    2.65 -0.000373   1.19 -3.22      4.38
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## #   PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>

Statistik deskriptif untuk MeanShift

meanshift_stats <- pca_data %>%
  select_if(is.numeric) %>% 
  bind_cols(meanshift = pca_data$meanshift) %>%
  group_by(meanshift) %>%
  summarise(across(everything(), list(mean = ~mean(.), 
                                      sd = ~sd(.), 
                                      min = ~min(.), 
                                      max = ~max(.)),
                   .names = "{col}_{fn}"))
print(meanshift_stats)
## # A tibble: 2 × 17
##   meanshift PC1_mean PC1_sd PC1_min PC1_max PC2_mean PC2_sd PC2_min PC2_max
##   <fct>        <dbl>  <dbl>   <dbl>   <dbl>    <dbl>  <dbl>   <dbl>   <dbl>
## 1 1            0.164  1.26    -4.56    2.65  -0.0196  1.20    -3.22    4.38
## 2 2           -3.91   0.605   -5.13   -2.44   0.467   0.771   -1.29    2.57
## # ℹ 8 more variables: PC3_mean <dbl>, PC3_sd <dbl>, PC3_min <dbl>,
## #   PC3_max <dbl>, PC4_mean <dbl>, PC4_sd <dbl>, PC4_min <dbl>, PC4_max <dbl>