Clustering lagu (track_name) berdasarkan
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v stringr 1.4.0
## v tidyr 1.0.0 v forcats 0.4.0
## v readr 1.3.1
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
spotify<- read.csv("SpotifyFeatures.csv")
head(spotify)
## ï..genre artist_name track_name
## 1 Movie Henri Salvador C'est beau de faire un Show
## 2 Movie Martin & les fées Perdu d'avance (par Gad Elmaleh)
## 3 Movie Joseph Williams Don't Let Me Be Lonely Tonight
## 4 Movie Henri Salvador Dis-moi Monsieur Gordon Cooper
## 5 Movie Fabien Nataf Ouverture
## 6 Movie Henri Salvador Le petit souper aux chandelles
## track_id popularity acousticness danceability duration_ms
## 1 0BRjO6ga9RKCKjfDqeFgWV 0 0.611 0.389 99373
## 2 0BjC1NfoEOOusryehmNudP 1 0.246 0.590 137373
## 3 0CoSDzoNIKCRs124s9uTVy 3 0.952 0.663 170267
## 4 0Gc6TVm52BwZD07Ki6tIvf 0 0.703 0.240 152427
## 5 0IuslXpMROHdEPvSl1fTQK 4 0.950 0.331 82625
## 6 0Mf1jKa8eNAf1a4PwTbizj 0 0.749 0.578 160627
## energy instrumentalness key liveness loudness mode speechiness tempo
## 1 0.9100 0.000 C# 0.3460 -1.828 Major 0.0525 166.969
## 2 0.7370 0.000 F# 0.1510 -5.559 Minor 0.0868 174.003
## 3 0.1310 0.000 C 0.1030 -13.879 Minor 0.0362 99.488
## 4 0.3260 0.000 C# 0.0985 -12.178 Major 0.0395 171.758
## 5 0.2250 0.123 F 0.2020 -21.150 Major 0.0456 140.576
## 6 0.0948 0.000 C# 0.1070 -14.970 Major 0.1430 87.479
## time_signature valence
## 1 4/4 0.814
## 2 4/4 0.816
## 3 5/4 0.368
## 4 4/4 0.227
## 5 4/4 0.390
## 6 4/4 0.358
glimpse(spotify)
## Observations: 232,725
## Variables: 18
## $ ï..genre <fct> Movie, Movie, Movie, Movie, Movie, Movie, Movie, M...
## $ artist_name <fct> Henri Salvador, Martin & les fées, Joseph William...
## $ track_name <fct> "C'est beau de faire un Show", "Perdu d'avance (pa...
## $ track_id <fct> 0BRjO6ga9RKCKjfDqeFgWV, 0BjC1NfoEOOusryehmNudP, 0C...
## $ popularity <int> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, ...
## $ acousticness <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.749...
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0...
## $ duration_ms <int> 99373, 137373, 170267, 152427, 82625, 160627, 2122...
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0....
## $ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.23e-01, ...
## $ key <fct> C#, F#, C, C#, F, C#, C#, F#, C, G, E, C, F#, D#, ...
## $ liveness <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0....
## $ loudness <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970...
## $ mode <fct> Major, Minor, Minor, Major, Major, Major, Major, M...
## $ speechiness <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0....
## $ tempo <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479...
## $ time_signature <fct> 4/4, 4/4, 5/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, ...
## $ valence <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0....
# Membuang Variabel yang tidak perlu dan mengambil hanya data numeric agar siap diproses
spot_leveled<- spotify %>%
mutate(popularity=as.numeric(popularity))%>%
group_by(track_name) %>%
summarise(energy=mean(energy),
popularity=mean(popularity),
speechiness=mean(speechiness),
acousticness=mean(acousticness),
instrumentalness=mean(instrumentalness),
tempo=mean(tempo),
danceability=mean(danceability),
liveness=mean(liveness),
valence=mean(valence),
duration_ms=mean(duration_ms),
loudness=mean(loudness)
)
spot_cleaner<- spot_leveled %>%
select(-track_name)
# EDA
GGally::ggcorr(spot_cleaner,label = T)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Berdasarkan gambar plot di atas dapat disimpulkan fitur audio yang paling berpengaruh dalam popularitas sebuah lagu adala“h”acousticness"
# Melakukan Scalling
sptify_z <- scale(spot_cleaner, center = T, scale = T)
Memilih K secara Random
# k-means with 3 clusters
set.seed(100)
sptify_km <- kmeans(sptify_z, 3)
Hasil k-means 3 cluster:
sptify_km$size # jumlah data tiap cluster
## [1] 38594 100762 9259
sptify_km$centers # letak pusat cluster atau centroid
## energy popularity speechiness acousticness instrumentalness tempo
## 1 -1.2852464 -0.5529941 -0.3809466 1.1631236 0.9275222 -0.3881391
## 2 0.4572420 0.2935443 -0.1755807 -0.5388577 -0.3045946 0.2037304
## 3 0.3812705 -0.8894974 3.4986631 1.0159610 -0.5513799 -0.5992489
## danceability liveness valence duration_ms loudness
## 1 -1.0165082 -0.2830985 -0.9064615 0.07717412 -1.2455042
## 2 0.3761734 -0.1042406 0.3574360 -0.03505573 0.4996075
## 3 0.1433347 2.3144391 -0.1114582 0.05981505 -0.2454322
sptify_km$iter # berapa kali pengulangan sampai menghasilkan kelompok yang stabil
## [1] 3
K OptimumPemilihan K menggunakan “Elbow Method”
# Fungsi manual untuk elbow plot
wss <- function(data, maxCluster = 9) {
# Initialize within sum of squares
RNGkind(sample.kind = "Rounding")
set.seed(50)
SSw <- (nrow(data) - 1) * sum(apply(data, 2, var))
SSw <- vector()
for (i in 2:maxCluster) {
SSw[i] <- sum(kmeans(data, centers = i)$withinss)
}
plot(1:maxCluster, SSw, type = "o", xlab = "Number of Clusters", ylab = "Within groups sum of squares", pch=19)
}
wss(sptify_z)
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)
Ambil k dimana ketika k ditambah, penurunan nilai total within sum of squares tidak lagi besar (menunjukkan kemiringan yang landai)
k = 4
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
sptify_km2 <- kmeans(sptify_z, 4)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)
Berikut ini kita akan membandingkan pemilihan nilai cluster secara random (=3) dan menggunakan “Elbow Method” (=4)
Kebaikan hasil clustering dapat dilihat dari 3 nilai:
$withinss): jarak tiap observasi ke centroid untuk tiap cluster -> dikuadratkan -> dijumlahkan$totss): jarak tiap observasi ke global sample mean (rata-rata data keseluruhan).$betweenss): jarak centroid tiap cluster ke global sample mean -> dikuadratkan -> dikali jumlah data tiap cluster -> dijumlahkan# Nilai cluster = 3
sptify_km$withinss
## [1] 319702.17 602536.05 88005.67
sptify_km$betweenss
## [1] 624510.1
sptify_km$totss
## [1] 1634754
# betweenss/totss
sptify_km$betweenss/sptify_km$totss
## [1] 0.3820208
# Nilai cluster = 4
sptify_km2$withinss
## [1] 283812.66 305915.52 258827.68 87936.63
sptify_km2$betweenss
## [1] 698261.5
sptify_km2$totss
## [1] 1634754
# betweenss/totss
sptify_km2$betweenss/sptify_km2$totss
## [1] 0.4271355
Clustering yang baik:
Berdasarkan perhitungan goodness of fit di atas dan juga berdasarkan “Elbow Method” nilai cluster K = 4 merupakan nilai terbaik karena memiliki “withinss” yang lebih rendah dan “betweenss/totss” yang lebih besar
Berikut kita akan mencoba menginterpretasikan hasil clustering.
# memasukkan hasil clustering ke data clean
spot_cleaner$cluster <- sptify_km2$cluster
# Mengelompokkan karakteristik berdasarkan cluster
spot_cleaner %>%
group_by(cluster) %>%
summarise_all(mean)
## # A tibble: 4 x 12
## cluster energy popularity speechiness acousticness instrumentalness tempo
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.703 44.8 0.0909 0.166 0.0890 131.
## 2 2 0.191 26.0 0.0512 0.847 0.493 104.
## 3 3 0.643 36.3 0.0982 0.288 0.0816 115.
## 4 4 0.658 20.4 0.861 0.789 0.00241 98.4
## # ... with 5 more variables: danceability <dbl>, liveness <dbl>, valence <dbl>,
## # duration_ms <dbl>, loudness <dbl>
Karakteristik setiap cluster
*1= high popularity,high energy,low speechiness,low acousticness, high tempo, high loudness
*2= low energy,high acousticness,high instrumentalness, low danceability, low liveness, low valence,high duration_ms
*3= high danceability, high valence,low duration_ms
*4= low popularity, high energy,high speechiness, low instrumentalness, low tempo, high liveness
Berikut adalah step untuk jawaban pertanyaan di atas:
# Memasukkan kembali varable track_name ke data clean yaang sudah memiliki cluster
spot_last<- data.frame(cbind(track_name=spot_leveled$track_name,spot_cleaner))
# Mencari cluster yang memiliki lagu dengan karakteristik yang sama
spot_last[spot_last$track_name == "(I'm Gonna Be) 500 Miles",]
## track_name energy popularity speechiness acousticness
## 416 (I'm Gonna Be) 500 Miles 0.854 47 0.236 0.00169
## instrumentalness tempo danceability liveness valence duration_ms loudness
## 416 0 135.658 0.392 0.12 0.612 252280 -5.299
## cluster
## 416 1
# cari 10 lagu yang juga termasuk cluster 1 yang merupakan lagu yang dapat direkomendasikan oleh spotify kepada pendengar
data.frame(head(spot_last[spot_last$cluster == 1,]$track_name,10))
## head.spot_last.spot_last.cluster....1....track_name..10.
## 1 '39 - Remastered 2011
## 2 '64 aka Go
## 3 '75 aka Stay With You
## 4 '79 aka The Shouty Track
## 5 '84
## 6 '85
## 7 '87
## 8 '88 aka Come Down On Me
## 9 '95 aka Make Things Right
## 10 '97 Bonnie & Clyde