Clustering lagu (track_name) berdasarkan

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.2

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.0     v forcats 0.4.0
## v readr   1.3.1

## Warning: package 'ggplot2' was built under R version 3.6.2

## Warning: package 'tibble' was built under R version 3.6.2

## Warning: package 'tidyr' was built under R version 3.6.2

## Warning: package 'readr' was built under R version 3.6.2

## Warning: package 'purrr' was built under R version 3.6.2

## Warning: package 'stringr' was built under R version 3.6.2

## Warning: package 'forcats' was built under R version 3.6.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Proses preprocessing data berupa cleansing dan eksplorasi data analisis

Read Data

spotify<- read.csv("SpotifyFeatures.csv")
head(spotify)

##   ï..genre        artist_name                       track_name
## 1    Movie     Henri Salvador      C'est beau de faire un Show
## 2    Movie Martin & les fÃ©es Perdu d'avance (par Gad Elmaleh)
## 3    Movie    Joseph Williams   Don't Let Me Be Lonely Tonight
## 4    Movie     Henri Salvador   Dis-moi Monsieur Gordon Cooper
## 5    Movie       Fabien Nataf                        Ouverture
## 6    Movie     Henri Salvador   Le petit souper aux chandelles
##                 track_id popularity acousticness danceability duration_ms
## 1 0BRjO6ga9RKCKjfDqeFgWV          0        0.611        0.389       99373
## 2 0BjC1NfoEOOusryehmNudP          1        0.246        0.590      137373
## 3 0CoSDzoNIKCRs124s9uTVy          3        0.952        0.663      170267
## 4 0Gc6TVm52BwZD07Ki6tIvf          0        0.703        0.240      152427
## 5 0IuslXpMROHdEPvSl1fTQK          4        0.950        0.331       82625
## 6 0Mf1jKa8eNAf1a4PwTbizj          0        0.749        0.578      160627
##   energy instrumentalness key liveness loudness  mode speechiness   tempo
## 1 0.9100            0.000  C#   0.3460   -1.828 Major      0.0525 166.969
## 2 0.7370            0.000  F#   0.1510   -5.559 Minor      0.0868 174.003
## 3 0.1310            0.000   C   0.1030  -13.879 Minor      0.0362  99.488
## 4 0.3260            0.000  C#   0.0985  -12.178 Major      0.0395 171.758
## 5 0.2250            0.123   F   0.2020  -21.150 Major      0.0456 140.576
## 6 0.0948            0.000  C#   0.1070  -14.970 Major      0.1430  87.479
##   time_signature valence
## 1            4/4   0.814
## 2            4/4   0.816
## 3            5/4   0.368
## 4            4/4   0.227
## 5            4/4   0.390
## 6            4/4   0.358

glimpse(spotify)

## Observations: 232,725
## Variables: 18
## $ ï..genre         <fct> Movie, Movie, Movie, Movie, Movie, Movie, Movie, M...
## $ artist_name      <fct> Henri Salvador, Martin & les fÃ©es, Joseph William...
## $ track_name       <fct> "C'est beau de faire un Show", "Perdu d'avance (pa...
## $ track_id         <fct> 0BRjO6ga9RKCKjfDqeFgWV, 0BjC1NfoEOOusryehmNudP, 0C...
## $ popularity       <int> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, ...
## $ acousticness     <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.749...
## $ danceability     <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0...
## $ duration_ms      <int> 99373, 137373, 170267, 152427, 82625, 160627, 2122...
## $ energy           <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0....
## $ instrumentalness <dbl> 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.23e-01, ...
## $ key              <fct> C#, F#, C, C#, F, C#, C#, F#, C, G, E, C, F#, D#, ...
## $ liveness         <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0....
## $ loudness         <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970...
## $ mode             <fct> Major, Minor, Minor, Major, Major, Major, Major, M...
## $ speechiness      <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0....
## $ tempo            <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479...
## $ time_signature   <fct> 4/4, 4/4, 5/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, ...
## $ valence          <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0....

Data Preparation

# Membuang Variabel yang tidak perlu dan mengambil hanya data numeric agar siap diproses
spot_leveled<- spotify %>% 
  mutate(popularity=as.numeric(popularity))%>% 
  group_by(track_name) %>% 
  summarise(energy=mean(energy),
            popularity=mean(popularity),
            speechiness=mean(speechiness),
            acousticness=mean(acousticness),
            instrumentalness=mean(instrumentalness),
            tempo=mean(tempo),
            danceability=mean(danceability),
            liveness=mean(liveness),
            valence=mean(valence),
            duration_ms=mean(duration_ms),
            loudness=mean(loudness)
            )

spot_cleaner<- spot_leveled %>% 
  select(-track_name)

EDA & Scalling

# EDA
GGally::ggcorr(spot_cleaner,label = T)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Berdasarkan gambar plot di atas dapat disimpulkan fitur audio yang paling berpengaruh dalam popularitas sebuah lagu adala“h”acousticness"

# Melakukan Scalling
sptify_z <- scale(spot_cleaner, center = T, scale = T)

K- Means

Memilih K secara Random

# k-means with 3 clusters
set.seed(100)
sptify_km <- kmeans(sptify_z, 3)

Hasil k-means 3 cluster:

sptify_km$size # jumlah data tiap cluster

## [1]  38594 100762   9259

sptify_km$centers # letak pusat cluster atau centroid

##       energy popularity speechiness acousticness instrumentalness      tempo
## 1 -1.2852464 -0.5529941  -0.3809466    1.1631236        0.9275222 -0.3881391
## 2  0.4572420  0.2935443  -0.1755807   -0.5388577       -0.3045946  0.2037304
## 3  0.3812705 -0.8894974   3.4986631    1.0159610       -0.5513799 -0.5992489
##   danceability   liveness    valence duration_ms   loudness
## 1   -1.0165082 -0.2830985 -0.9064615  0.07717412 -1.2455042
## 2    0.3761734 -0.1042406  0.3574360 -0.03505573  0.4996075
## 3    0.1433347  2.3144391 -0.1114582  0.05981505 -0.2454322

sptify_km$iter # berapa kali pengulangan sampai menghasilkan kelompok yang stabil

## [1] 3

Pemilihan `K` Optimum

Pemilihan K menggunakan “Elbow Method”

# Fungsi manual untuk elbow plot

wss <- function(data, maxCluster = 9) {
    # Initialize within sum of squares
    RNGkind(sample.kind = "Rounding")
    set.seed(50)
    SSw <- (nrow(data) - 1) * sum(apply(data, 2, var))
    SSw <- vector()
    for (i in 2:maxCluster) {
        SSw[i] <- sum(kmeans(data, centers = i)$withinss)
    }
    plot(1:maxCluster, SSw, type = "o", xlab = "Number of Clusters", ylab = "Within groups sum of squares", pch=19)
}

wss(sptify_z)

## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)

Ambil k dimana ketika k ditambah, penurunan nilai total within sum of squares tidak lagi besar (menunjukkan kemiringan yang landai)

k = 4

RNGkind(sample.kind = "Rounding")

## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

set.seed(100)
sptify_km2 <- kmeans(sptify_z, 4)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 7430750)

Goodness of Fit

Berikut ini kita akan membandingkan pemilihan nilai cluster secara random (=3) dan menggunakan “Elbow Method” (=4)

Kebaikan hasil clustering dapat dilihat dari 3 nilai:

Within Sum of Squares ($withinss): jarak tiap observasi ke centroid untuk tiap cluster -> dikuadratkan -> dijumlahkan
Total Sum of Squares ($totss): jarak tiap observasi ke global sample mean (rata-rata data keseluruhan).
Between Sum of Squares ($betweenss): jarak centroid tiap cluster ke global sample mean -> dikuadratkan -> dikali jumlah data tiap cluster -> dijumlahkan

# Nilai cluster = 3
sptify_km$withinss

## [1] 319702.17 602536.05  88005.67

sptify_km$betweenss

## [1] 624510.1

sptify_km$totss

## [1] 1634754

# betweenss/totss
sptify_km$betweenss/sptify_km$totss

## [1] 0.3820208

# Nilai cluster = 4
sptify_km2$withinss

## [1] 283812.66 305915.52 258827.68  87936.63

sptify_km2$betweenss

## [1] 698261.5

sptify_km2$totss

## [1] 1634754

# betweenss/totss
sptify_km2$betweenss/sptify_km2$totss

## [1] 0.4271355

Clustering yang baik:

withinss semakin rendah
betweenss/totss mendekati 1

Berdasarkan perhitungan goodness of fit di atas dan juga berdasarkan “Elbow Method” nilai cluster K = 4 merupakan nilai terbaik karena memiliki “withinss” yang lebih rendah dan “betweenss/totss” yang lebih besar

Interpretasi/Cluster Profiling

Berikut kita akan mencoba menginterpretasikan hasil clustering.

Mengetahu karakteristik setiap cluster

# memasukkan hasil clustering ke data clean
spot_cleaner$cluster <- sptify_km2$cluster 

# Mengelompokkan karakteristik berdasarkan cluster 

spot_cleaner %>% 
  group_by(cluster) %>% 
  summarise_all(mean)

## # A tibble: 4 x 12
##   cluster energy popularity speechiness acousticness instrumentalness tempo
##     <int>  <dbl>      <dbl>       <dbl>        <dbl>            <dbl> <dbl>
## 1       1  0.703       44.8      0.0909        0.166          0.0890  131. 
## 2       2  0.191       26.0      0.0512        0.847          0.493   104. 
## 3       3  0.643       36.3      0.0982        0.288          0.0816  115. 
## 4       4  0.658       20.4      0.861         0.789          0.00241  98.4
## # ... with 5 more variables: danceability <dbl>, liveness <dbl>, valence <dbl>,
## #   duration_ms <dbl>, loudness <dbl>

Karakteristik setiap cluster

*1= high popularity,high energy,low speechiness,low acousticness, high tempo, high loudness

*2= low energy,high acousticness,high instrumentalness, low danceability, low liveness, low valence,high duration_ms

*3= high danceability, high valence,low duration_ms

*4= low popularity, high energy,high speechiness, low instrumentalness, low tempo, high liveness

Mengajukan pertanyaan business: Jika seseorang memutar lagu “(I’m Gonna Be) 500 Miles” maka lagu apakah yang dapat direkomendasikan berikutnya oleh aplikasi spotify untuk dapat diputar oleh pendengar?

Berikut adalah step untuk jawaban pertanyaan di atas:

# Memasukkan kembali varable track_name ke data clean yaang sudah memiliki cluster

spot_last<- data.frame(cbind(track_name=spot_leveled$track_name,spot_cleaner))

# Mencari cluster yang memiliki lagu dengan karakteristik yang sama

spot_last[spot_last$track_name == "(I'm Gonna Be) 500 Miles",]

##                   track_name energy popularity speechiness acousticness
## 416 (I'm Gonna Be) 500 Miles  0.854         47       0.236      0.00169
##     instrumentalness   tempo danceability liveness valence duration_ms loudness
## 416                0 135.658        0.392     0.12   0.612      252280   -5.299
##     cluster
## 416       1

# cari 10 lagu yang juga termasuk cluster 1 yang merupakan lagu yang dapat direkomendasikan oleh spotify kepada pendengar
data.frame(head(spot_last[spot_last$cluster == 1,]$track_name,10))

##    head.spot_last.spot_last.cluster....1....track_name..10.
## 1                                     '39 - Remastered 2011
## 2                                                '64 aka Go
## 3                                     '75 aka Stay With You
## 4                                  '79 aka The Shouty Track
## 5                                                       '84
## 6                                                       '85
## 7                                                       '87
## 8                                   '88 aka Come Down On Me
## 9                                 '95 aka Make Things Right
## 10                                       '97 Bonnie & Clyde

LBB Unsupervised Learning

Roji

4/7/2020

Proses preprocessing data berupa cleansing dan eksplorasi data analisis

K- Means

Pemilihan `K` Optimum

Goodness of Fit

Interpretasi/Cluster Profiling

LBB Unsupervised Learning

Roji

4/7/2020

Proses preprocessing data berupa cleansing dan eksplorasi data analisis

K- Means

Pemilihan K Optimum

Goodness of Fit

Interpretasi/Cluster Profiling

Pemilihan `K` Optimum