This is an unsupervised learning analysis of the Spotify tracks dataset, which can be downloaded at:
https://www.kaggle.com/datasets/zaheenhamidani/ultimate-spotify-tracks-db
The dataset is composed of tracks characterized by the following musical measures:
popularity, acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, mode, speechiness, tempo, time_signature, valence.
The dataset has no target label variable, so the tracks are analysed without supervision: Principal Component Analysis is used to summarise the features, and K-Means clustering is used to group the tracks.
Invoke the required libraries.
library(tm)
library(tidyverse)
library(lubridate)
library(cluster)
library(ggforce)
library(GGally)
library(scales)
library(cowplot)
library(plotly)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(ggcorrplot)
# Strongly prefer fixed over scientific notation when printing numbers, and
# cap printed output at 101 entries so long vectors are truncated.
options(
  max.print = 101,
  scipen = 100
)
Import the dataset and use glimpse() to view the data fields’ information.
# Load the raw track table from the working directory.
data <- read.csv(file = "SpotifyFeatures.csv")
# List every column with its type and first few values.
dplyr::glimpse(data)
## Rows: 232,725
## Columns: 20
## $ genre <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie",…
## $ artist_name <chr> "Henri Salvador", "Martin & les fées", "Joseph Willia…
## $ track_name <chr> "C'est beau de faire un Show", "Perdu d'avance (par G…
## $ track_id <chr> "0BRjO6ga9RKCKjfDqeFgWV", "0BjC1NfoEOOusryehmNudP", "…
## $ popularity <chr> "0", "1", "3", "0", "4", "0", "2", "15", "0", "10", "…
## $ acousticness <chr> "0.611", "0.246", "0.952", "0.703", "0.95", "0.749", …
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41…
## $ duration_ms <dbl> 99373, 137373, 170267, 152427, 82625, 160627, 212293,…
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270…
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123…
## $ key <chr> "C#", "F#", "C", "C#", "F", "C#", "C#", "F#", "C", "G…
## $ liveness <chr> "0.346", "0.151", "0.103", "0.0985", "0.202", "0.107"…
## $ loudness <chr> "-1.828", "-5.559", "-13.879", "-12.178", "-21.15", "…
## $ mode <chr> "Major", "Minor", "Minor", "Major", "Major", "Major",…
## $ speechiness <chr> "0.0525", "0.0868", "0.0362", "0.0395", "0.0456", "0.…
## $ tempo <chr> "166.969", "174.003", "99.488", "171.758", "140.576",…
## $ time_signature <chr> "4/4", "4/4", "5/4", "4/4", "4/4", "4/4", "4/4", "4/4…
## $ valence <chr> "0.814", "0.816", "0.368", "0.227", "0.39", "0.358", …
## $ X <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ X.1 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
Check the unique values of data$X.
# Column names are case-sensitive: read.csv() named this column `X`, so
# `data$x` does not exist and would silently evaluate to NULL.
unique(data$X)
## NULL
Check the unique values of data$X.1.
# Column names are case-sensitive: read.csv() named this column `X.1`, so
# `data$x.1` does not exist and would silently evaluate to NULL.
unique(data$X.1)
## NULL
Check the unique values of instrumentalness.
# Distinct instrumentalness values, in order of first appearance.
unique(data[["instrumentalness"]])
## [1] 0.00000000 0.12300000 0.00086000 0.00125000 0.52900000 0.88700000
## [7] 0.07290000 0.93300000 0.00000725 0.04220000 0.00000813 0.00032800
## [13] 0.00110000 0.00200000 0.00000127 0.91900000 0.00000832 0.97600000
## [19] 0.00004900 0.00073200 0.00068300 0.00002940 0.06080000 0.89100000
## [25] 0.04180000 0.02860000 0.00001660 0.01170000 0.00002440 0.01060000
## [31] 0.00018400 0.00008630 0.00000900 0.00016900 0.00445000 0.85500000
## [37] 0.00004810 0.00038100 0.00000603 0.00003550 0.00002950 0.00000316
## [43] 0.00007440 0.33100000 0.95900000 0.00007550 0.00000805 0.00000295
## [49] 0.86100000 0.03270000 0.22900000 0.00000655 0.00000140 0.13900000
## [55] 0.32700000 0.00051200 0.00061400 0.88100000 0.78500000 0.00000859
## [61] 0.92600000 0.14500000 0.00000503 0.00000395 0.00243000 0.00013000
## [67] 0.00020800 0.00000833 0.00000807 0.01820000 0.00000927 0.00013100
## [73] 0.00006590 0.00050100 0.09980000 0.00000761 0.00000141 0.00012700
## [79] 0.01610000 0.00001140 0.00000515 0.00000210 0.00000378 0.25400000
## [85] 0.00020000 0.01840000 0.00002880 0.00000588 0.00001780 0.00018900
## [91] 0.00001190 0.00132000 0.00000865 0.00016100 0.00005070 0.00003030
## [97] 0.00000108 0.00452000 0.00000139 0.01260000 0.00131000
## [ reached getOption("max.print") -- omitted 5309 entries ]
Check the unique values of acousticness.
# Distinct acousticness values (still stored as character strings here).
unique(data[["acousticness"]])
## [1] "0.611" "0.246" "0.952" "0.703" "0.95" "0.749" "0.344"
## [8] "0.939" "0.00104" "0.319" "0.921" "0.0383" "0.215" "0.958"
## [15] "0.97" "0.548" "0.7" "0.488" "0.381" "0.161" "0.852"
## [22] "0.513" "0.689" "0.669" "0.706" "0.882" "0.159" "0.864"
## [29] "0.716" "0.184" "0.00323" "0.305" "0.922" "0.942" "0.123"
## [36] "0.767" "0.164" "0.619" "0.932" "0.508" "0.649" "0.983"
## [43] "0.934" "0.576" "0.751" "0.614" "0.0287" "0.285" "0.659"
## [50] "0.581" "0.902" "0.386" "0.712" "0.917" "0.924" "0.967"
## [57] "0.989" "0.728" "0.871" "0.126" "0.929" "0.455" "0.733"
## [64] "0.71" "0.0575" "0.667" "0.791" "0.334" "0.933" "0.48"
## [71] "0.615" "0.979" "0.695" "0.756" "0.91" "0.776" "0.715"
## [78] "0.778" "0.398" "0.761" "0.616" "0.962" "0.525" "0.876"
## [85] "0.847" "0.66" "0.366" "0.357" "0.813" "0.345" "0.79"
## [92] "0.0367" "0.744" "0" "0.844" "0.987" "0.414" "0.636"
## [99] "0.951" "0.974" "0.523"
## [ reached getOption("max.print") -- omitted 4686 entries ]
Check the unique values of popularity.
# Distinct popularity values; non-numeric entries indicate malformed rows.
unique(data[["popularity"]])
## [1] "0" "1" "3"
## [4] "4" "2" "15"
## [7] "10" "8" "5"
## [10] "6" "7" "11"
## [13] "3NXlNZSmjO3DsJ3DQuyU8e" "65" "63"
## [16] "62" "61" "68"
## [19] "64" "66" "60"
## [22] "69" "71" "76"
## [25] "67" "70" "72"
## [28] "57" "59" "56"
## [31] "28" "31" "74"
## [34] "55" "53" "9"
## [37] "13" "23" "15hzCoAhSiWk8juLz5Jwut"
## [40] "12" "4Er5ftgKkMeKg07YPLVOd6" "7Ib5UBlT8wh2RuwbMkKRb5"
## [43] "44" "33" "25"
## [46] "26" "24" "22"
## [49] "20" "19" "18"
## [52] "16" "17" "14"
## [55] "83" "81" "73"
## [58] "78" "77" "75"
## [61] "45" "42" "46"
## [64] "54" "41" "52"
## [67] "58" "51" "43"
## [70] "47" "48" "40"
## [73] "50" "49" "39"
## [76] "80" "37" "35"
## [79] "21" "38" "36"
## [82] "29" "7gNbidsHK16wvnuK2VgaVc" "34"
## [85] "32" "99" "100"
## [88] "97" "92" "91"
## [91] "95" "90" "93"
## [94] "88" "87" "89"
## [97] "96" "86" "85"
## [100] "84" "94"
## [ reached getOption("max.print") -- omitted 103 entries ]
Keep only the rows whose popularity is purely numeric (dropping rows that contain non-numeric values such as track ids), and store the numeric value in a new column A.
# Keep only the rows whose popularity consists solely of digits, then add a
# numeric copy of popularity as column `A`.
# `drop = FALSE` is spelled out because `F` is an ordinary variable that can
# be reassigned, whereas `FALSE` is a reserved constant.
data <- transform(
  data[grepl("^\\d+$", data$popularity), , drop = FALSE],
  A = as.numeric(as.character(popularity))
)
Check the unique values of popularity.
# Distinct popularity strings remaining after the digit-only filter.
unique(data[["popularity"]])
## [1] "0" "1" "3" "4" "2" "15" "10" "8" "5" "6" "7" "11"
## [13] "65" "63" "62" "61" "68" "64" "66" "60" "69" "71" "76" "67"
## [25] "70" "72" "57" "59" "56" "28" "31" "74" "55" "53" "9" "13"
## [37] "23" "12" "44" "33" "25" "26" "24" "22" "20" "19" "18" "16"
## [49] "17" "14" "83" "81" "73" "78" "77" "75" "45" "42" "46" "54"
## [61] "41" "52" "58" "51" "43" "47" "48" "40" "50" "49" "39" "80"
## [73] "37" "35" "21" "38" "36" "29" "34" "32" "99" "100" "97" "92"
## [85] "91" "95" "90" "93" "88" "87" "89" "96" "86" "85" "84" "94"
## [97] "82" "79" "27" "30" "98"
Check numeric values of new data$A.
# Distinct values of the numeric popularity copy.
unique(data[["A"]])
## [1] 0 1 3 4 2 15 10 8 5 6 7 11 65 63 62 61 68 64
## [19] 66 60 69 71 76 67 70 72 57 59 56 28 31 74 55 53 9 13
## [37] 23 12 44 33 25 26 24 22 20 19 18 16 17 14 83 81 73 78
## [55] 77 75 45 42 46 54 41 52 58 51 43 47 48 40 50 49 39 80
## [73] 37 35 21 38 36 29 34 32 99 100 97 92 91 95 90 93 88 87
## [91] 89 96 86 85 84 94 82 79 27 30 98
Convert key, mode and time_signature to factor type.
# Treat the three categorical descriptors as factors.
data <- data %>%
  mutate(
    key = factor(key),
    mode = factor(mode),
    time_signature = factor(time_signature)
  )
Convert key, acousticness, liveness, popularity, loudness, speechiness, tempo, and valence to numeric type. Note that key is already a factor at this point, so it is converted to its integer level codes.
# Coerce the listed columns to numeric in one pass. `across()` replaces the
# superseded `mutate_at()` and matches the factor conversion earlier in the
# file. `key` is a factor at this point, so as.numeric() yields its integer
# level codes rather than the note names.
data <- data %>%
  mutate(
    across(
      all_of(c(
        "key", "acousticness", "liveness", "popularity",
        "loudness", "speechiness", "tempo", "valence"
      )),
      as.numeric
    )
  )
View structure of transformed data.
# Column-by-column overview of the converted table.
dplyr::glimpse(x = data)
## Rows: 232,603
## Columns: 21
## $ genre <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie",…
## $ artist_name <chr> "Henri Salvador", "Martin & les fées", "Joseph Willia…
## $ track_name <chr> "C'est beau de faire un Show", "Perdu d'avance (par G…
## $ track_id <chr> "0BRjO6ga9RKCKjfDqeFgWV", "0BjC1NfoEOOusryehmNudP", "…
## $ popularity <dbl> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, …
## $ acousticness <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,…
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41…
## $ duration_ms <dbl> 99373, 137373, 170267, 152427, 82625, 160627, 212293,…
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270…
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123…
## $ key <dbl> 5, 10, 4, 5, 9, 5, 5, 10, 4, 11, 8, 4, 10, 7, 11, 12,…
## $ liveness <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105…
## $ loudness <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -…
## $ mode <fct> Major, Minor, Minor, Major, Major, Major, Major, Majo…
## $ speechiness <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953…
## $ tempo <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8…
## $ time_signature <fct> 4/4, 4/4, 5/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4…
## $ valence <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533…
## $ X <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ X.1 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ A <dbl> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, …
Check for missing values.
# Number of missing values per column.
data %>%
  is.na() %>%
  colSums()
## genre artist_name track_name track_id
## 0 0 0 0
## popularity acousticness danceability duration_ms
## 0 0 0 0
## energy instrumentalness key liveness
## 0 0 0 0
## loudness mode speechiness tempo
## 0 0 0 0
## time_signature valence X X.1
## 0 0 0 232603
## A
## 0
Remove non meaningful columns.
# Drop the two empty import-artifact columns, the helper column A and the
# track identifier.
data_cleaned <- select(data, -X, -X.1, -A, -track_id)
Check number of rows and display the transformed and cleaned data.
# Row count of the cleaned table.
dim(data_cleaned)[1]
## [1] 232603
# Render the cleaned table as a paged table in the report.
data_cleaned %>%
  rmarkdown::paged_table()
We drop the columns that are not meaningful for Principal Component Analysis and clustering, namely artist_name, track_name, genre, mode and time_signature, and convert the “key” column into factor type.
# Remove the descriptive and categorical columns, then store key as a factor.
spotify_reduced <- select(
  data_cleaned,
  -artist_name, -track_name, -genre, -mode, -time_signature
)
spotify_reduced[["key"]] <- as.factor(data_cleaned[["key"]])
Scale the chosen data columns.
# Centre and scale the ten continuous audio features (column-wise z-scores).
spotify_reduced.scaled <- scale(
  spotify_reduced[, c(
    "energy", "liveness", "tempo", "speechiness", "acousticness",
    "instrumentalness", "danceability", "duration_ms", "loudness", "valence"
  )]
)
View the scaled data.
# Show the scaled matrix together with its centring/scaling attributes.
print(spotify_reduced.scaled)
## energy liveness tempo speechiness acousticness
## 1 1.28677820573 0.66067044236 1.59553078739 -0.368026641 0.683600143
## 2 0.63011056975 -0.32280635876 1.82317135659 -0.183147727 -0.345290172
## 3 -1.67012403954 -0.56489295596 -0.58834940172 -0.455884551 1.644837396
## 4 -0.92994953655 -0.58758857445 1.75051666824 -0.438097367 0.942936880
## 5 -1.31332197143 -0.06558934924 0.74137702809 -0.405218026 1.639199641
## 6 -1.80753079343 -0.54471907286 -0.97699534934 0.119773409 1.072605248
## 7 -1.14251247074 -0.55480601441 -1.12605882129 4.485718606 -0.069040170
## 8 -1.14630823742 -0.51445824821 -0.67446705237 -0.496848975 1.608191987
## 9 -0.34160570084 -0.69854493150 0.23988168258 -0.403062004 -1.035802423
## 10 0.50864603592 0.67580085469 0.64169932648 -0.499544003 -0.139512109
## instrumentalness danceability duration_ms loudness
## 1 -0.489868378 -0.890999212 -1.14294225406 1.290537266908
## 2 -0.489868378 0.191875596 -0.82295591648 0.668520702628
## 3 -0.489868378 0.585158487 -0.54596563785 -0.718554562664
## 4 -0.489868378 -1.693727303 -0.69619080264 -0.434971025854
## 5 -0.083649263 -1.203470549 -1.28397202200 -1.930744977801
## 6 -0.489868378 0.127226354 -0.62714111927 -0.900441475457
## 7 -0.489868378 0.800655961 -0.19207758987 -0.517828767062
## 8 -0.489868378 -0.745538417 0.04179874012 0.103354218332
## 9 -0.487028147 0.967666503 -0.07497101101 0.307414329476
## 10 -0.485740135 0.234975091 -0.69394247759 0.296577803966
## valence
## 1 1.3809857490
## 2 1.3886766187
## 3 -0.3340781846
## 4 -0.8762844954
## 5 -0.2494786183
## 6 -0.3725325329
## 7 0.3004185622
## 8 -0.6955490585
## 9 1.1925594424
## 10 1.0118240055
## [ reached getOption("max.print") -- omitted 232593 rows ]
## attr(,"scaled:center")
## energy liveness tempo speechiness
## 0.5709965 0.2150048 117.6677546 0.1207788
## acousticness instrumentalness danceability duration_ms
## 0.3684921 0.1483283 0.5543846 235103.1876803
## loudness valence
## -9.5689426 0.4548766
## attr(,"scaled:scale")
## energy liveness tempo speechiness
## 0.2634514 0.1982762 30.8995889 0.1855268
## acousticness instrumentalness danceability duration_ms
## 0.3547511 0.3027923 0.1856170 118755.0702572
## loudness valence
## 5.9982325 0.2600486
Check for correlation between columns in dataset using cor().
# Pairwise correlations between the scaled features, drawn as a heat map.
corr_matrix <- spotify_reduced.scaled %>% cor()
print(ggcorrplot(corr_matrix))
We will use princomp() and prcomp() to generate new dimensions to describe the dataset.
Check importance of each newly computed components using princomp().
# princomp() expects the observations as its first argument. Passing the
# 10 x 10 correlation matrix there makes it treat the matrix rows as ten
# observations, which distorts the variance shares and forces the last
# component to zero variance. Fit on the scaled tracks instead; a precomputed
# correlation matrix would have to be supplied via `covmat =`, and then no
# scores would be available for biplot().
data.pca <- princomp(spotify_reduced.scaled)
summary(data.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.0334851 0.4790364 0.36319775 0.30096532 0.23594290
## Proportion of Variance 0.6501807 0.1396891 0.08029933 0.05513895 0.03388749
## Cumulative Proportion 0.6501807 0.7898698 0.87016915 0.92530810 0.95919559
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## Standard deviation 0.16690440 0.14901480 0.124957342 0.0368111108 0
## Proportion of Variance 0.01695748 0.01351713 0.009504935 0.0008248661 0
## Cumulative Proportion 0.97615307 0.98967020 0.999175134 1.0000000000 1
From the above process, we notice that 10 principal components have been generated (Comp.1 to Comp.10).
In the Cumulative Proportion section, the first principal component explains almost 65% of the total variance.
Check the loadings of the original data columns in these computed components.
# Loadings of every original variable on the ten components, as a plain matrix.
loadings(data.pca)[, seq_len(10)]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## energy 0.449644805 0.04786607 0.279328068 0.02257737 0.31478102
## liveness -0.005136535 0.59660419 0.283635807 0.14279390 0.14268476
## tempo 0.146381788 -0.30077103 0.229791191 0.67724938 -0.49283877
## speechiness -0.008476668 0.64207534 0.035882341 0.06415608 -0.11204161
## acousticness -0.447070449 0.17618493 -0.246276321 0.06311711 -0.25705228
## instrumentalness -0.350406265 -0.28870737 -0.009199244 0.05068814 0.59923333
## danceability 0.329996145 0.05870279 -0.490869353 -0.26034338 -0.16405562
## duration_ms -0.100046536 -0.13924179 0.510883549 -0.66017224 -0.39099361
## loudness 0.476212127 -0.03522546 0.181103843 -0.04347133 0.12774580
## valence 0.328561091 -0.02408548 -0.438115336 -0.06533427 -0.06200723
## Comp.6 Comp.7 Comp.8 Comp.9
## energy 0.0095242526 0.005779868 0.2636812874 0.67995478
## liveness -0.4925077364 -0.087449578 -0.5100175147 -0.06092720
## tempo -0.0004688077 -0.297290639 -0.0849707459 -0.01288403
## speechiness 0.4296343651 -0.400505024 0.4366140774 -0.18384812
## acousticness -0.0854219621 0.372514068 0.1424843321 0.15562320
## instrumentalness 0.0476624965 -0.493676127 -0.0254341232 -0.17879015
## danceability 0.2997240167 -0.281325644 -0.5391899815 0.15977990
## duration_ms -0.1336673784 -0.253536172 0.0387082601 -0.02449449
## loudness 0.1642318553 0.386369173 0.0008389973 -0.62182042
## valence -0.6546123502 -0.261427754 0.3991602237 -0.17609233
## Comp.10
## energy 0.29311593
## liveness 0.09379477
## tempo 0.19488565
## speechiness 0.01698776
## acousticness 0.66917203
## instrumentalness 0.39182696
## danceability 0.25841212
## duration_ms 0.19144493
## loudness 0.39756666
## valence 0.06488294
Display a scree plot from data.pca to see the importance of each component.
# Scree plot with the percentage label on each bar.
print(fviz_eig(data.pca, addlabels = TRUE))
The scree plot above shows that the first component explains 65% of the variance in the dataset. Plot a graph to view the loadings of the original variables on the new components. As this can only be visualized in two dimensions, we can only see the loadings for components 1 and 2.
# Variable plot for the princomp() fit, all arrows drawn in black
print(fviz_pca_var(data.pca, col.var = "black"))
Check to view how much of each original variable is represented in dimensions 1 and 2.
# cos2 of each variable over dimensions 1 and 2.
print(fviz_cos2(data.pca, choice = "var", axes = 1:2))
# Variable plot again, this time coloured by cos2 with non-overlapping labels.
print(
  fviz_pca_var(
    data.pca,
    col.var = "cos2",
    gradient.cols = c("black", "orange", "green"),
    repel = TRUE
  )
)
View the same information as above but for all dimensions in a tabular format.
# Eigenvalue, variance share and cumulative share per dimension
eig.val <- get_eigenvalue(data.pca)
print(eig.val)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.068091524 65.01807063 65.01807
## Dim.2 0.229475833 13.96891146 78.98698
## Dim.3 0.131912609 8.02993295 87.01692
## Dim.4 0.090580122 5.51389526 92.53081
## Dim.5 0.055669050 3.38874915 95.91956
## Dim.6 0.027857080 1.69574755 97.61531
## Dim.7 0.022205412 1.35171285 98.96702
## Dim.8 0.015614337 0.95049354 99.91751
## Dim.9 0.001355058 0.08248661 100.00000
## Dim.10 0.000000000 0.00000000 100.00000
# Variable-level results of the princomp() fit
res.var <- get_pca_var(data.pca)
# Coordinates of the variables on each dimension
res.var[["coord"]]
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## energy 0.464701222 0.02292959 0.101451327 0.006795006 0.07427034
## liveness -0.005308532 0.28579510 0.103015888 0.042976011 0.03366545
## tempo 0.151283402 -0.14408026 0.083459645 0.203828575 -0.11628181
## speechiness -0.008760510 0.30757744 0.013032386 0.019308754 -0.02643542
## acousticness -0.462040663 0.08439899 -0.089447007 0.018996060 -0.06064966
## instrumentalness -0.362139667 -0.13830133 -0.003341145 0.015255373 0.14138485
## danceability 0.341046111 0.02812077 -0.178282647 -0.078354327 -0.03870776
## duration_ms -0.103396608 -0.06670188 0.185551758 -0.198688947 -0.09225216
## loudness 0.492158155 -0.01687428 0.065776509 -0.013083363 0.03014071
## valence 0.339563004 -0.01153782 -0.159122506 -0.019663351 -0.01463016
## Dim.6 Dim.7 Dim.8 Dim.9
## energy 0.00158963969 0.0008612859 0.0329489129 0.0250298909
## liveness -0.08220170994 -0.0130312818 -0.0637304331 -0.0022427978
## tempo -0.00007824606 -0.0443007063 -0.0106177186 -0.0004742756
## speechiness 0.07170786742 -0.0596811776 0.0545581347 -0.0067676535
## acousticness -0.01425730162 0.0555101109 0.0178044634 0.0057286630
## instrumentalness 0.00795508055 -0.0735650514 -0.0031781804 -0.0065814639
## danceability 0.05002525821 -0.0419216857 -0.0673757470 0.0058816755
## duration_ms -0.02230967406 -0.0377806430 0.0048368813 -0.0009016694
## loudness 0.02741101983 0.0575747266 0.0001048389 -0.0228899005
## valence -0.10925768382 -0.0389566055 0.0498780007 -0.0064821542
## Dim.10
## energy 0
## liveness 0
## tempo 0
## speechiness 0
## acousticness 0
## instrumentalness 0
## danceability 0
## duration_ms 0
## loudness 0
## valence 0
# Contributions of the variables to the PCs
res.var[["contrib"]]
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## energy 20.218045069 0.22911607 7.802416950 0.05097377 9.9087088
## liveness 0.002638399 35.59365613 8.044927097 2.03900975 2.0358940
## tempo 2.142762774 9.04632134 5.280399140 45.86667251 24.2890057
## speechiness 0.007185389 41.22607485 0.128754243 0.41160024 1.2553323
## acousticness 19.987198643 3.10411307 6.065202651 0.39837692 6.6075877
## instrumentalness 12.278455087 8.33519470 0.008462608 0.25692877 35.9080580
## danceability 10.889745598 0.34460181 24.095272199 6.77786745 2.6914248
## duration_ms 1.000930932 1.93882772 26.100200094 43.58273817 15.2876002
## loudness 22.677799031 0.12408330 3.279860212 0.18897567 1.6318990
## valence 10.795239077 0.05801102 19.194504806 0.42685674 0.3844896
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## energy 0.00907113871 0.003340688 6.95278213231 46.23385075 NaN
## liveness 24.25638703706 0.764742877 26.01178653443 0.37121233 NaN
## tempo 0.00002197806 8.838172401 0.72200276519 0.01659983 NaN
## speechiness 18.45856876904 16.040427389 19.06318525778 3.38001312 NaN
## acousticness 0.72969116007 13.876673104 2.03017848825 2.42185812 NaN
## instrumentalness 0.22717135725 24.371611861 0.06468946210 3.19659164 NaN
## danceability 8.98344861934 7.914411791 29.07258361037 2.55296156 NaN
## duration_ms 1.78669680504 6.428059063 0.14983294023 0.05999801 NaN
## loudness 2.69721022834 14.928113788 0.00007039164 38.66606387 NaN
## valence 42.85173290708 6.834447039 15.93288841770 3.10085077 NaN
# Quality of representation (cos2) of the variables
res.var[["cos2"]]
## Dim.1 Dim.2 Dim.3 Dim.4
## energy 0.21594722573 0.0005257660 0.01029237176 0.00004617211
## liveness 0.00002818051 0.0816788389 0.01061227322 0.00184693753
## tempo 0.02288666758 0.0207591213 0.00696551227 0.04154608809
## speechiness 0.00007674653 0.0946038788 0.00016984308 0.00037282800
## acousticness 0.21348157462 0.0071231893 0.00800076705 0.00036085030
## instrumentalness 0.13114513808 0.0191272575 0.00001116325 0.00023272640
## danceability 0.11631244973 0.0007907779 0.03178470219 0.00613940063
## duration_ms 0.01069085845 0.0044491411 0.03442945488 0.03947729757
## loudness 0.24221964931 0.0002847412 0.00432654918 0.00017117439
## valence 0.11530303360 0.0001331213 0.02531997206 0.00038664736
## Dim.5 Dim.6 Dim.7
## energy 0.0055160841 0.000002526954357 0.0000007418134
## liveness 0.0011333628 0.006757121116775 0.0001698143054
## tempo 0.0135214588 0.000000006122446 0.0019625525811
## speechiness 0.0006988316 0.005142018249633 0.0035618429633
## acousticness 0.0036783813 0.000203270649454 0.0030813724130
## instrumentalness 0.0199896749 0.000063283306491 0.0054118167867
## danceability 0.0014982906 0.002502526459297 0.0017574277331
## duration_ms 0.0085104618 0.000497721556479 0.0014273769884
## loudness 0.0009084627 0.000751364008270 0.0033148491470
## valence 0.0002140417 0.011937241472705 0.0015176171121
## Dim.8 Dim.9 Dim.10
## energy 0.00108563085923 0.0006264954376 0
## liveness 0.00406156810728 0.0000050301419 0
## tempo 0.00011273594763 0.0000002249373 0
## speechiness 0.00297659006096 0.0000458011341 0
## acousticness 0.00031699891851 0.0000328175794 0
## instrumentalness 0.00001010083086 0.0000433156669 0
## danceability 0.00453949128915 0.0000345941068 0
## duration_ms 0.00002339542079 0.0000008130078 0
## loudness 0.00000001099119 0.0005239475453 0
## valence 0.00248781495145 0.0000420183226 0
Use a biplot to view the loadings on dimensions 1 and 2.
# biplot() draws as a side effect and returns NULL invisibly, so there is
# nothing worth storing or printing; call it directly.
biplot(data.pca)
Now we use another Principal Component Analysis (PCA) method, the prcomp() function, to reduce the dimensionality from the current ten (10) attributes.
# PCA of the scaled feature matrix via prcomp().
km_pca <- prcomp(x = spotify_reduced.scaled)
km_pca
## Standard deviations (1, .., p=10):
## [1] 1.8598987 1.2736103 1.0805238 0.9755786 0.8911513 0.8340208 0.7224191
## [8] 0.6290964 0.5260709 0.3395067
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4 PC5
## energy -0.46288362 0.048539594 -0.262962371 0.02410196 0.23315823
## liveness -0.05451613 0.641679889 -0.228647374 0.14427840 0.07137532
## tempo -0.16182899 -0.206137689 -0.316727458 0.67371223 -0.59241483
## speechiness -0.05355133 0.675961422 0.009716966 0.06414449 -0.14133312
## acousticness 0.42027103 0.226232382 0.225490866 0.06090180 -0.26438028
## instrumentalness 0.33290119 -0.173276562 -0.086712133 0.04730942 0.04374225
## danceability -0.34159993 0.034058937 0.461889968 -0.26365828 -0.29686399
## duration_ms 0.06634947 -0.002321853 -0.579893204 -0.66220086 -0.46787149
## loudness -0.47466331 -0.062343730 -0.161817820 -0.04173756 0.20751005
## valence -0.34621007 -0.014705015 0.379462602 -0.06989459 -0.38433023
## PC6 PC7 PC8 PC9 PC10
## energy -0.22946948 0.02421827 -0.27005340 -0.14286895 -0.716035365
## liveness -0.20752260 -0.43614997 0.52092257 -0.01047949 0.048187408
## tempo 0.05790394 0.10561876 0.10484265 -0.02186972 -0.013430821
## speechiness -0.03120918 0.54042965 -0.40616092 0.15627919 0.183293004
## acousticness 0.13436722 -0.16894027 -0.19459832 -0.70479380 -0.257955243
## instrumentalness -0.85631733 0.25492526 0.04860984 -0.17932361 0.121224354
## danceability -0.06876644 0.38419642 0.54825682 -0.14141793 -0.195557997
## duration_ms 0.01555523 -0.03386144 -0.01948756 -0.01152142 -0.001206506
## loudness 0.07275085 0.05022084 -0.05318060 -0.62128597 0.550619014
## valence -0.37195949 -0.51104191 -0.36841180 0.13984949 0.168916196
PCA transforms the variables of the original data set into principal components (PC1:PC10). The degree to which each principal component explains the variance in the original data is also inferred from the standard deviation of each principal component and its calculated variance.
# Share of total variance carried by each principal component. The component
# labels are derived from the number of standard deviations in the fit rather
# than a hard-coded 1:10, so the table stays correct if the feature set changes.
pca_tbl <- tibble(
  proportional_variance = km_pca$sdev^2 / sum(km_pca$sdev^2),
  PC = paste0("PC", seq_along(km_pca$sdev))
)
print(pca_tbl)
## # A tibble: 10 × 2
## proportional_variance PC
## <dbl> <chr>
## 1 0.346 PC1
## 2 0.162 PC2
## 3 0.117 PC3
## 4 0.0952 PC4
## 5 0.0794 PC5
## 6 0.0696 PC6
## 7 0.0522 PC7
## 8 0.0396 PC8
## 9 0.0277 PC9
## 10 0.0115 PC10
In contrast to the PRINCOMP() function method, here the first component only explains 35% of the variance in data. We plot the Cumulative Variance Explained.
# Cumulative share of variance explained as components are added.
# The x positions follow the rows of pca_tbl instead of a hard-coded 1:10.
# The y breaks start at 0 so the early part of the curve (which begins near
# 35%) also gets tick marks; breaks outside the data range are not drawn.
ggplot(
  pca_tbl,
  aes(x = seq_along(PC), y = cumsum(proportional_variance))
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(
    breaks = seq_along(pca_tbl$PC),
    labels = pca_tbl$PC,
    name = "Principal Component"
  ) +
  scale_y_continuous(
    name = "Cumulative Variance Explained",
    breaks = seq(from = 0, to = 1, by = 0.05),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(caption = "Fig. 1 Explaining dataset variance using PCA")
The above shows that using only the first six (6) principal components - PC1 to PC6 - we can explain more than 85% of the data set’s variance. We plot the importance of these principal components using a scree plot.
# Scree plot of the prcomp() fit with the percentage label on each bar.
print(fviz_eig(km_pca, addlabels = TRUE))
Plot the loadings of the original variables on components 1 and 2.
# Variable plot for the prcomp() fit, all arrows drawn in black
print(fviz_pca_var(km_pca, col.var = "black"))
Check to view how much of each original variable is represented in dimensions 1 and 2.
# cos2 of each variable over dimensions 1 and 2.
print(fviz_cos2(km_pca, choice = "var", axes = 1:2))
# Variable plot again, this time coloured by cos2 with non-overlapping labels.
print(
  fviz_pca_var(
    km_pca,
    col.var = "cos2",
    gradient.cols = c("black", "orange", "green"),
    repel = TRUE
  )
)
View the same information as above but for all dimensions in a tabular format.
# Eigenvalue, variance share and cumulative share per dimension
eig.val <- get_eigenvalue(km_pca)
print(eig.val)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.4592233 34.592233 34.59223
## Dim.2 1.6220832 16.220832 50.81307
## Dim.3 1.1675317 11.675317 62.48838
## Dim.4 0.9517536 9.517536 72.00592
## Dim.5 0.7941506 7.941506 79.94742
## Dim.6 0.6955907 6.955907 86.90333
## Dim.7 0.5218893 5.218893 92.12222
## Dim.8 0.3957622 3.957622 96.07985
## Dim.9 0.2767506 2.767506 98.84735
## Dim.10 0.1152648 1.152648 100.00000
# Variable-level results of the prcomp() fit
res.var <- get_pca_var(km_pca)
# Coordinates of the variables on each dimension
res.var[["coord"]]
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## energy -0.86091667 0.061820527 -0.28413710 0.02351336 0.20777925
## liveness -0.10139448 0.817250120 -0.24705893 0.14075491 0.06360620
## tempo -0.30098554 -0.262539085 -0.34223155 0.65725922 -0.52793123
## speechiness -0.09960005 0.860911433 0.01049941 0.06257799 -0.12594919
## acousticness 0.78166156 0.288131893 0.24364824 0.05941449 -0.23560282
## instrumentalness 0.61916250 -0.220686815 -0.09369452 0.04615405 0.03898096
## danceability -0.63534128 0.043377814 0.49908310 -0.25721937 -0.26455072
## duration_ms 0.12340329 -0.002957136 -0.62658840 -0.64602897 -0.41694427
## loudness -0.88282569 -0.079401617 -0.17484800 -0.04071827 0.18492284
## valence -0.64391568 -0.018728458 0.41001837 -0.06818766 -0.34249637
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## energy -0.19138232 0.01749574 -0.16988961 -0.075159195 -0.2430988365
## liveness -0.17307817 -0.31508306 0.32771049 -0.005512956 0.0163599500
## tempo 0.04829309 0.07630101 0.06595613 -0.011505021 -0.0045598543
## speechiness -0.02602910 0.39041668 -0.25551436 0.082213934 0.0622292113
## acousticness 0.11206506 -0.12204567 -0.12242110 -0.370771499 -0.0875775449
## instrumentalness -0.71418647 0.18416287 0.03058027 -0.094336932 0.0411564859
## danceability -0.05735264 0.27755082 0.34490637 -0.074395855 -0.0663932591
## duration_ms 0.01297338 -0.02446215 -0.01225955 -0.006061084 -0.0004096168
## loudness 0.06067572 0.03628049 -0.03345572 -0.326840463 0.1869388693
## valence -0.31022195 -0.36918642 -0.23176652 0.073570747 0.0573481881
# Contributions of the variables to the PCs
res.var[["contrib"]]
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## energy 21.4261250 0.2356092189 6.914920880 0.05809046 5.4362759
## liveness 0.2972008 41.1753079632 5.227962142 2.08162554 0.5094436
## tempo 2.6188623 4.2492746845 10.031628234 45.38881732 35.0955335
## speechiness 0.2867745 45.6923843487 0.009441943 0.41145162 1.9975051
## acousticness 17.6627739 5.1181090659 5.084613056 0.37090295 6.9896932
## instrumentalness 11.0823202 3.0024766882 0.751899395 0.22381809 0.1913384
## danceability 11.6690512 0.1160011219 21.334234287 6.95156898 8.8128229
## duration_ms 0.4402252 0.0005391002 33.627612760 43.85099731 21.8903730
## loudness 22.5305254 0.3886740625 2.618500696 0.17420238 4.3060420
## valence 11.9861415 0.0216237461 14.399186608 0.48852534 14.7709724
## Dim.6 Dim.7 Dim.8 Dim.9 Dim.10
## energy 5.26562428 0.05865244 7.29288371 2.04115365 51.2706644603
## liveness 4.30656312 19.02267994 27.13603230 0.01098198 0.2322026282
## tempo 0.33528657 1.11553222 1.09919804 0.04782845 0.0180386949
## speechiness 0.09740128 29.20642047 16.49666962 2.44231866 3.3596325361
## acousticness 1.80545505 2.85408139 3.78685076 49.67342992 6.6540907199
## instrumentalness 73.32793643 6.49868897 0.23629163 3.21569579 1.4695343998
## danceability 0.47288229 14.76068858 30.05855449 1.99990307 3.8242930266
## duration_ms 0.02419651 0.11465971 0.03797649 0.01327431 0.0001455656
## loudness 0.52926860 0.25221327 0.28281767 38.59962610 30.3181298278
## valence 13.83538589 26.11638301 13.57272528 1.95578808 2.8532681407
# Quality of representation (cos2) of the variables
res.var[["cos2"]]
## Dim.1 Dim.2 Dim.3 Dim.4
## energy 0.741177516 0.003821777587 0.0807338904 0.0005528781
## liveness 0.010280841 0.667897758379 0.0610381131 0.0198119452
## tempo 0.090592294 0.068926771332 0.1171224354 0.4319896863
## speechiness 0.009920169 0.741168496148 0.0001102377 0.0039160054
## acousticness 0.610994795 0.083019987982 0.0593644671 0.0035300821
## instrumentalness 0.383362205 0.048702670333 0.0087786635 0.0021301967
## danceability 0.403658542 0.001881634725 0.2490839392 0.0661618055
## duration_ms 0.015228373 0.000008744654 0.3926130247 0.4173534296
## loudness 0.779381192 0.006304616722 0.0305718246 0.0016579774
## valence 0.414627403 0.000350755155 0.1681150621 0.0046495573
## Dim.5 Dim.6 Dim.7 Dim.8
## energy 0.043172216 0.0366271938 0.0003061008 0.0288624793
## liveness 0.004045749 0.0299560533 0.0992773330 0.1073941668
## tempo 0.278711378 0.0023322223 0.0058218434 0.0043502107
## speechiness 0.015863198 0.0006775143 0.1524251862 0.0652875877
## acousticness 0.055508688 0.0125585777 0.0148951457 0.0149869251
## instrumentalness 0.001519515 0.5100623201 0.0339159630 0.0009351530
## danceability 0.069987083 0.0032893253 0.0770344558 0.1189604058
## duration_ms 0.173842522 0.0001683087 0.0005983968 0.0001502966
## loudness 0.034196457 0.0036815432 0.0013162741 0.0011192855
## valence 0.117303761 0.0962376602 0.1362986110 0.0537157204
## Dim.9 Dim.10
## energy 0.00564890454 0.059097044285
## liveness 0.00003039269 0.000267647965
## tempo 0.00013236551 0.000020792271
## speechiness 0.00675913101 0.003872474735
## acousticness 0.13747150463 0.007669826364
## instrumentalness 0.00889945671 0.001693856329
## danceability 0.00553474330 0.004408064860
## duration_ms 0.00003673674 0.000000167786
## loudness 0.10682468851 0.034946140838
## valence 0.00541265481 0.003288814675
Plot the dataset using new dimensional components 1 and 2 and ‘danceability’ as 3rd variable in terms of color tone.
# would not render into .pdf
# ggplot2::autoplot(km_pca, data = data_cleaned, colour = 'danceability')
Plot the dataset using new dimensional components 1 and 2 and ‘energy’ as 3rd variable in terms of color tone.
# would not render into .pdf
# ggplot2::autoplot(km_pca, data = data_cleaned, colour = 'energy')
Use kmeans() to partition the dataset into 10 clusters.
# Fix the RNG seed so the random initial centres are reproducible, then
# partition the scaled features into ten clusters and show the fit.
set.seed(123)
km_10 <- kmeans(x = spotify_reduced.scaled, centers = 10)
print(km_10)
## K-means clustering with 10 clusters of sizes 22325, 35951, 9630, 3952, 52237, 28236, 46794, 108, 21086, 12284
##
## Cluster means:
## energy liveness tempo speechiness acousticness instrumentalness
## 1 -1.3290418 -0.1764674 -0.31024696 -0.36809381 1.3376732 -0.35598308
## 2 0.7731637 -0.1539308 1.41650750 -0.06430712 -0.7058481 -0.27337354
## 3 0.3681702 2.6126105 -0.64832710 4.17208956 1.2144820 -0.48800649
## 4 -1.3224690 -0.1758294 -0.44303299 -0.30439439 1.2101034 1.48929376
## 5 0.5918253 -0.2600115 -0.25000007 -0.10051790 -0.6329072 -0.33946172
## 6 -0.5969196 -0.3406919 -0.06118599 -0.15283855 0.7220474 -0.13622259
## 7 0.3042919 -0.2827280 -0.26654143 -0.21507673 -0.6536308 -0.15060635
## 8 -0.3753096 0.7539466 -0.80834933 2.49580982 0.9351803 -0.01980392
## 9 -1.5038465 -0.3892401 -0.45747143 -0.40931604 1.3068999 2.28914952
## 10 0.5704019 2.4069607 0.08048224 -0.03685787 -0.4366858 -0.24840148
## danceability duration_ms loudness valence
## 1 -0.91610715 -0.05680334 -0.9469556 -0.91255378
## 2 -0.26055771 -0.07760216 0.6283494 0.29094665
## 3 0.04410542 -0.12981649 -0.4107823 -0.18225243
## 4 -1.38658587 3.51740454 -1.5386806 -1.11317389
## 5 0.86109123 -0.10684960 0.5349201 1.01592334
## 6 0.59129195 -0.28915360 -0.1998337 0.45646890
## 7 0.05713629 0.11668976 0.4327238 -0.59749389
## 8 -0.17391141 23.80895659 -0.6481866 0.02627823
## 9 -1.36710407 -0.23352332 -1.8547459 -1.13155661
## 10 -0.05130122 0.16653602 0.4247701 0.15678481
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 1 1 1 1 3 1 5 5 4 9 7 6 9 8 1 5 7 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 6 5 6 6 6 5 6 6 5 5 3 6 6 5 6 5 6 9 6
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 6 9 1 6 6 6 5 5 5 6 1 10 3 1 3 1 1 10 9 7
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 6 1 6 6 2 5 1 5 6 6 1 1 6 3 1 6 1 4 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 98 99 100 101
## 5 6 3 9 10 6 6 1 6 6 7 6 2 6 5 1 6 9 7 5
## 102
## 6
## [ reached getOption("max.print") -- omitted 232502 entries ]
##
## Within cluster sum of squares by cluster:
## [1] 94441.126 127524.764 57243.046 41856.639 145044.366 117894.327
## [7] 166138.730 6345.525 111628.508 67389.655
## (between_SS / total_SS = 59.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Plot the k-means result with 10 centers.
# Disabled: this plot would not render when the document was knitted to .pdf.
# If re-enabled as written, it re-fits k-means under seed 1 instead of reusing
# km_10 (fitted under seed 123), so the plotted clusters may differ from km_10.
# NOTE(review): autoplot() for kmeans objects is provided by ggfortify, which
# is not among the libraries loaded at the top of this file -- confirm.
# set.seed(1)
# autoplot(kmeans(spotify_reduced.scaled, 10), data = spotify_reduced.scaled)
Use kmeans() to cluster the dataset into 7 clusters (7 centers).
# Seed the RNG so the randomly chosen starting centers, and therefore the
# resulting clusters, are reproducible between runs.
set.seed(123)
# Fit k-means with 7 centers on the scaled feature matrix.
# NOTE(review): only `centers` is set, so kmeans() runs with its defaults for
# the number of random starts and the iteration cap -- confirm that is intended.
km_7 <- kmeans(x = spotify_reduced.scaled, centers = 7)
# Show the fit: cluster sizes, cluster means, assignments and sums of squares.
print(km_7)
## K-means clustering with 7 clusters of sizes 22667, 39079, 10018, 23521, 56881, 31062, 49375
##
## Cluster means:
## energy liveness tempo speechiness acousticness instrumentalness
## 1 -1.3334698 -0.06860210 -0.33183609 -0.36652258 1.3419550 -0.3469254
## 2 0.7976342 0.16989815 1.37388833 -0.05595129 -0.7059861 -0.2757158
## 3 0.3599028 2.60575281 -0.64173234 4.10083161 1.1930227 -0.4867282
## 4 -1.5107929 -0.37611351 -0.46372339 -0.41159065 1.3185027 2.2779556
## 5 0.5789961 -0.14583843 -0.24627668 -0.08762423 -0.6202609 -0.3435237
## 6 -0.6116571 -0.29491438 -0.04630864 -0.16313431 0.7276884 -0.1311518
## 7 0.3453222 -0.09896193 -0.27109748 -0.21985191 -0.6706929 -0.1306630
## danceability duration_ms loudness valence
## 1 -0.977368450 0.08460353 -0.9840996 -0.9362126
## 2 -0.273900462 -0.05509184 0.6304404 0.2830091
## 3 0.041010693 0.06499160 -0.4009519 -0.1647969
## 4 -1.402124960 0.19937570 -1.8539274 -1.1570395
## 5 0.858405465 -0.10998391 0.5256913 0.9974758
## 6 0.538809220 -0.27931492 -0.2007224 0.3643702
## 7 -0.002777729 0.19902175 0.4379860 -0.5879171
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 6 1 1 1 3 1 5 5 4 4 7 6 4 3 1 5 7 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 6 5 6 6 6 5 6 6 5 5 3 6 6 5 6 5 6 4 6
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 6 4 1 6 6 6 5 5 5 6 1 2 3 6 3 1 1 1 4 7
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 6 1 6 6 2 5 6 5 6 6 1 1 6 3 6 6 1 1 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 98 99 100 101
## 5 6 3 4 6 6 6 6 6 6 7 6 2 6 5 1 6 4 7 5
## 102
## 6
## [ reached getOption("max.print") -- omitted 232502 entries ]
##
## Within cluster sum of squares by cluster:
## [1] 115957.15 169521.95 97929.37 167498.56 178439.09 136345.26 233534.47
## (between_SS / total_SS = 52.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Plot the k-means result with 7 centers.
# Disabled, like the other autoplot() calls in this document (presumably it
# also would not render into .pdf -- confirm).
# If re-enabled as written, it re-fits k-means under seed 1 instead of reusing
# km_7 (fitted under seed 123), so the plotted clusters may differ from km_7.
# set.seed(1)
# autoplot(kmeans(spotify_reduced.scaled, 7), data = spotify_reduced.scaled)
Use kmeans() to cluster dataset into 3 centers.
# Seed the RNG so the randomly chosen starting centers, and therefore the
# resulting clusters, are reproducible between runs.
set.seed(123)
# Fit k-means with 3 centers on the scaled feature matrix.
# NOTE(review): only `centers` is set, so kmeans() runs with its defaults for
# the number of random starts and the iteration cap -- confirm that is intended.
km_3 <- kmeans(x = spotify_reduced.scaled, centers = 3)
# Show the fit: cluster sizes, cluster means, assignments and sums of squares.
print(km_3)
## K-means clustering with 3 clusters of sizes 53140, 169097, 10366
##
## Cluster means:
## energy liveness tempo speechiness acousticness instrumentalness
## 1 -1.3585806 -0.25338106 -0.3686223 -0.3778857 1.2767734 0.9674164
## 2 0.4060388 -0.07866395 0.1540547 -0.1278091 -0.4730022 -0.2742902
## 3 0.3410212 2.58214421 -0.6233460 4.0220889 1.1707034 -0.4849378
## danceability duration_ms loudness valence
## 1 -1.01051313 0.09955897 -1.3183608 -0.9145171
## 2 0.31493676 -0.03539747 0.4387450 0.2965517
## 3 0.04281365 0.06705024 -0.3986855 -0.1493891
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 1 1 1 1 3 1 2 2 1 1 2 2 1 3 3 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 1 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 1 1 2 3 2 2 2 2 2 1 2 3 1 3 1 1 1 1 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 1 2 2 2 2 1 2 2 2 1 1 1 3 1 2 1 1 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 98 99 100 101
## 2 2 3 1 2 2 2 2 2 2 2 2 2 3 2 1 2 1 2 2
## 102
## 1
## [ reached getOption("max.print") -- omitted 232502 entries ]
##
## Within cluster sum of squares by cluster:
## [1] 453830.3 908321.4 108320.2
## (between_SS / total_SS = 36.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Plot the k-means result with 3 centers.
# Disabled: this plot would not render when the document was knitted to .pdf.
# If re-enabled as written, it re-fits k-means under seed 1 instead of reusing
# km_3 (fitted under seed 123), so the plotted clusters may differ from km_3.
# NOTE(review): autoplot() for kmeans objects is provided by ggfortify, which
# is not among the libraries loaded at the top of this file -- confirm.
# set.seed(1)
# autoplot(kmeans(spotify_reduced.scaled, 3), data = spotify_reduced.scaled)
Compute the between-cluster sum of squares as a percentage of the total sum of squares (between_SS / total_SS) for the 3-cluster solution.
# Between-cluster SS as a share of total SS for k = 3: round the ratio to four
# decimals, convert to a percentage, and print it with a "%" suffix.
print(
  paste0(100 * round(km_3[["betweenss"]] / km_3[["totss"]], digits = 4), "%")
)
## [1] "36.78%"
Compute the between-cluster sum of squares as a percentage of the total sum of squares (between_SS / total_SS) for the 7-cluster solution.
# Between-cluster SS as a share of total SS for k = 7: round the ratio to four
# decimals, convert to a percentage, and print it with a "%" suffix.
print(
  paste0(100 * round(km_7[["betweenss"]] / km_7[["totss"]], digits = 4), "%")
)
## [1] "52.74%"
Compute the between-cluster sum of squares as a percentage of the total sum of squares (between_SS / total_SS) for the 10-cluster solution.
# Between-cluster SS as a share of total SS for k = 10: round the ratio to four
# decimals, convert to a percentage, and print it with a "%" suffix.
print(
  paste0(100 * round(km_10[["betweenss"]] / km_10[["totss"]], digits = 4), "%")
)
## [1] "59.78%"
We choose the 7-cluster solution to classify the data and add a new column, “cluster_num”, to the dataset to record the cluster assigned to each track.
# Attach the 7-cluster assignment to the feature table as column `cluster_num`.
# NOTE(review): the labels come from a fit on spotify_reduced.scaled; this
# assumes spotify_reduced has the same rows in the same order -- confirm.
spotify_songs_final <- cbind(
  spotify_reduced,
  cluster_num = km_7[["cluster"]]
)
Display original dataset with new classification in new “cluster_num” column.
# Render the labelled table as a paged table in the knitted document.
rmarkdown::paged_table(x = spotify_songs_final)
Check the proportion of each type of cluster in dataset.
# Tabulate the cluster labels (header "\nFrequency") and convert the counts
# into the share of rows falling in each cluster.
prop.table(
  table("\nFrequency" = factor(spotify_songs_final$cluster_num))
)
##
## Frequency
## 1 2 3 4 5 6 7
## 0.09744930 0.16800729 0.04306909 0.10112079 0.24454113 0.13354084 0.21227155
# Use the fviz_cluster() function from the factoextra package to visualise the
# 7-cluster k-means solution on the scaled features.
fviz_cluster(km_7, data = spotify_reduced.scaled)
We used princomp() and prcomp() as preliminaries to using k-means. The prcomp() function generated components that are similar to the k-means components in terms of their cumulative explained variance. According to the scree plot for prcomp(), seven (7) components explain about 85% of the variance in the original dataset, so we choose seven (7) clusters to classify the original dataset (the 7-cluster k-means solution itself accounts for 52.74% of the total sum of squares). This clustering shows that about 24% of the tracks lie in cluster number 5, 21% in cluster number 7, 17% in cluster number 2, and 13% in cluster number 6; the remaining tracks fall in clusters 4 (10%), 1 (10%), and 3 (4%).