library(tidyverse)
library(lubridate)
library(cluster)
library(factoextra)
library(ggforce)
library(GGally)
library(scales)
library(cowplot)
library(FactoMineR)
library(factoextra)
library(plotly)

# Load the Spotify track-features table. stringsAsFactors = TRUE turns every
# text column (genre, artist, track name, key, mode, ...) into a factor.
# NOTE(review): the "ï..genre" column name in the output below looks like a
# UTF-8 byte-order mark leaking into the first header; passing
# fileEncoding = "UTF-8-BOM" to read.csv() should yield a clean "genre" --
# confirm nothing downstream relies on the mangled name before changing it.
spotify <- read.csv("SpotifyFeatures.csv", stringsAsFactors = TRUE)
head(spotify)

# Count missing values per column (all zero in the output below).
colSums(is.na(spotify))
## ï..genre artist_name track_name track_id
## 0 0 0 0
## popularity acousticness danceability duration_ms
## 0 0 0 0
## energy instrumentalness key liveness
## 0 0 0 0
## loudness mode speechiness tempo
## 0 0 0 0
## time_signature valence
## 0 0
# Bias number printing heavily toward fixed (non-scientific) notation.
# This is a session-wide option and affects all later output.
options(scipen = 123)
# Compact overview of every column: its type and first few values.
glimpse(spotify)
## Rows: 232,725
## Columns: 18
## $ ï..genre <fct> Movie, Movie, Movie, Movie, Movie, Movie, Movie, Movi~
## $ artist_name <fct> "Henri Salvador", "Martin & les fées", "Joseph Willi~
## $ track_name <fct> "C'est beau de faire un Show", "Perdu d'avance (par G~
## $ track_id <fct> 0BRjO6ga9RKCKjfDqeFgWV, 0BjC1NfoEOOusryehmNudP, 0CoSD~
## $ popularity <int> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, ~
## $ acousticness <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,~
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41~
## $ duration_ms <int> 99373, 137373, 170267, 152427, 82625, 160627, 212293,~
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270~
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123~
## $ key <fct> C#, F#, C, C#, F, C#, C#, F#, C, G, E, C, F#, D#, G, ~
## $ liveness <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105~
## $ loudness <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -~
## $ mode <fct> Major, Minor, Minor, Major, Major, Major, Major, Majo~
## $ speechiness <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953~
## $ tempo <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8~
## $ time_signature <fct> 4/4, 4/4, 5/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4~
## $ valence <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533~
# Number of distinct track names; it is smaller than the row count, so some
# track names occur more than once.
length(unique(spotify$track_name))
## [1] 148615
# Drop the track name together with the time-signature, tempo and duration
# columns; the remaining columns feed the analysis below.
spot_clean <- spotify %>%
  select(-c(track_name, time_signature, tempo, duration_ms))

# Alternative kept for reference (not run): de-duplicate tracks and use the
# track name as row names.
# spotify %>%
#   distinct(track_name, .keep_all = TRUE) %>%
#   column_to_rownames("track_name") %>%
#   select(-c(time_signature))

# Keep only the numeric features and standardize each one with scale().
# The assignment has to start on its own line: when it was fused onto the
# comment above, it became part of that comment and spot_num was never created.
spot_num <- spot_clean %>%
  select(where(is.numeric)) %>%
  scale()
# Preview the first rows of the standardized feature matrix.
head(spot_num)
## popularity acousticness danceability energy instrumentalness
## [1,] -2.261002 0.6833748 -0.8909329 1.2869052 -0.48981747
## [2,] -2.206026 -0.3454664 0.1919933 0.6302479 -0.48981747
## [3,] -2.096075 1.6445663 0.5852948 -1.6699502 -0.48981747
## [4,] -2.261002 0.9426992 -1.6936990 -0.9297874 -0.48981747
## [5,] -2.041100 1.6389288 -1.2034190 -1.3131538 -0.08356631
## [6,] -2.261002 1.0723614 0.1273410 -1.8073548 -0.48981747
## liveness loudness speechiness valence
## [1,] 0.66065975 1.2907007 -0.3679692 1.3807413
## [2,] -0.32283477 0.6686811 -0.1830817 1.3884316
## [3,] -0.56492573 -0.7184009 -0.4558311 -0.3342114
## [4,] -0.58762176 -0.4348159 -0.4380431 -0.8763826
## [5,] -0.06561313 -1.9305971 -0.4051623 -0.2496173
## [6,] -0.54475148 -0.9002886 0.1198533 -0.3726633
# Per-column summary; every mean is 0 because scale() centered the columns.
summary(spot_num)
## popularity acousticness danceability energy
## Min. :-2.2610 Min. :-1.0389 Min. :-2.68019 Min. :-2.1671
## 1st Qu.:-0.6667 1st Qu.:-0.9329 1st Qu.:-0.64310 1st Qu.:-0.7058
## Median : 0.1029 Median :-0.3849 Median : 0.08963 Median : 0.1292
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7626 3rd Qu.: 0.9963 3rd Qu.: 0.74154 3rd Qu.: 0.8200
## Max. : 3.2365 Max. : 1.7686 Max. : 2.34168 Max. : 1.6247
## instrumentalness liveness loudness speechiness
## Min. :-0.4898 Min. :-1.0356 Min. :-7.1500 Min. :-0.53129
## 1st Qu.:-0.4898 1st Qu.:-0.5932 1st Qu.:-0.3670 1st Qu.:-0.45314
## Median :-0.4897 Median :-0.4388 Median : 0.3014 Median :-0.38091
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:-0.3716 3rd Qu.: 0.2471 3rd Qu.: 0.6784 3rd Qu.:-0.08498
## Max. : 2.8097 Max. : 3.9591 Max. : 2.2196 Max. : 4.56146
## valence
## Min. :-1.74924
## 1st Qu.:-0.83793
## Median :-0.04198
## Mean : 0.00000
## 3rd Qu.: 0.78858
## Max. : 2.09595
# Covariance matrix of the standardized features. Because each column was
# scaled to unit variance, the diagonal is 1 and the off-diagonal entries can
# be read as correlations.
cov(spot_num)
## popularity acousticness danceability energy
## popularity 1.00000000 -0.38129531 0.25656447 0.2489218
## acousticness -0.38129531 1.00000000 -0.36454559 -0.7255764
## danceability 0.25656447 -0.36454559 1.00000000 0.3258070
## energy 0.24892177 -0.72557636 0.32580699 1.0000000
## instrumentalness -0.21098311 0.31615411 -0.36494121 -0.3789569
## liveness -0.16799519 0.06900353 -0.04168384 0.1928009
## loudness 0.36301074 -0.69020168 0.43866848 0.8160880
## speechiness -0.15107582 0.15093494 0.13455996 0.1451198
## valence 0.06007629 -0.32579820 0.54715402 0.4367712
## instrumentalness liveness loudness speechiness
## popularity -0.2109831 -0.16799519 0.363010736 -0.151075818
## acousticness 0.3161541 0.06900353 -0.690201678 0.150934938
## danceability -0.3649412 -0.04168384 0.438668484 0.134559958
## energy -0.3789569 0.19280086 0.816087967 0.145119802
## instrumentalness 1.0000000 -0.13419771 -0.506320170 -0.177147448
## liveness -0.1341977 1.00000000 0.045685710 0.510146517
## loudness -0.5063202 0.04568571 1.000000000 -0.002272769
## speechiness -0.1771474 0.51014652 -0.002272769 1.000000000
## valence -0.3075218 0.01180437 0.399901355 0.023841622
## valence
## popularity 0.06007629
## acousticness -0.32579820
## danceability 0.54715402
## energy 0.43677118
## instrumentalness -0.30752185
## liveness 0.01180437
## loudness 0.39990136
## speechiness 0.02384162
## valence 1.00000000
# Plot the variance captured by each principal component.
plot(prcomp(x = spot_num))

# prcomp() ----
# Principal component analysis can be run in R with the prcomp() function.
prcomp(x = spot_num)
## Standard deviations (1, .., p=9):
## [1] 1.8802288 1.3006698 1.0139470 0.9174325 0.8038658 0.6968040 0.6159872
## [8] 0.5264301 0.3390014
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## popularity 0.24089587 -0.31708831 0.256447194 -0.68145552 0.37531867
## acousticness -0.42326267 0.19295718 -0.266384447 -0.22440955 -0.09978162
## danceability 0.34421161 0.02622329 -0.559697963 -0.24639005 0.28693156
## energy 0.45138122 0.09688642 0.284317216 0.32078921 0.08949921
## instrumentalness -0.32837482 -0.17061043 0.066958160 0.39163185 0.80935606
## liveness 0.03613013 0.63159642 0.273038513 0.04223413 0.14385327
## loudness 0.47235620 -0.03250515 0.201893702 0.10509460 -0.11092132
## speechiness 0.03741947 0.64821945 -0.004459839 -0.26174278 0.24752986
## valence 0.32700957 0.05367401 -0.591068263 0.29484162 0.09754045
## PC6 PC7 PC8 PC9
## popularity -0.31925153 -0.259829075 -0.014354494 -0.02783106
## acousticness -0.15614843 -0.265062845 0.699210877 -0.26230336
## danceability 0.18549600 0.579501922 0.141385575 -0.18522958
## energy 0.11978841 -0.209739913 0.148661677 -0.71740890
## instrumentalness 0.06739574 0.006464756 0.177168401 0.11905171
## liveness -0.60484497 0.367052535 0.007094397 0.04623558
## loudness 0.11763603 0.002570336 0.624936999 0.55436672
## speechiness 0.51278825 -0.357264027 -0.159976747 0.18008108
## valence -0.42162596 -0.469045502 -0.145705655 0.15929268
# Fit the PCA and keep the result. spot_num is already standardized, so
# scale. = TRUE does not change the outcome (the standard deviations below
# match the unscaled call above); it is kept as an explicit safeguard.
pca <- prcomp(x = spot_num, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8802 1.3007 1.0139 0.91743 0.8039 0.69680 0.61599
## Proportion of Variance 0.3928 0.1880 0.1142 0.09352 0.0718 0.05395 0.04216
## Cumulative Proportion 0.3928 0.5808 0.6950 0.78853 0.8603 0.91428 0.95644
## PC8 PC9
## Standard deviation 0.52643 0.33900
## Proportion of Variance 0.03079 0.01277
## Cumulative Proportion 0.98723 1.00000
# Standard deviation of each principal component, in PC order.
pca$sdev
## [1] 1.8802288 1.3006698 1.0139470 0.9174325 0.8038658 0.6968040 0.6159872
## [8] 0.5264301 0.3390014
# pca$sdev: standard deviation (square root of the variance) of each PC. The
# variance (eigenvalue) captured by each PC is obtained by squaring this value.

pca$rotation
## PC1 PC2 PC3 PC4 PC5
## popularity 0.24089587 -0.31708831 0.256447194 -0.68145552 0.37531867
## acousticness -0.42326267 0.19295718 -0.266384447 -0.22440955 -0.09978162
## danceability 0.34421161 0.02622329 -0.559697963 -0.24639005 0.28693156
## energy 0.45138122 0.09688642 0.284317216 0.32078921 0.08949921
## instrumentalness -0.32837482 -0.17061043 0.066958160 0.39163185 0.80935606
## liveness 0.03613013 0.63159642 0.273038513 0.04223413 0.14385327
## loudness 0.47235620 -0.03250515 0.201893702 0.10509460 -0.11092132
## speechiness 0.03741947 0.64821945 -0.004459839 -0.26174278 0.24752986
## valence 0.32700957 0.05367401 -0.591068263 0.29484162 0.09754045
## PC6 PC7 PC8 PC9
## popularity -0.31925153 -0.259829075 -0.014354494 -0.02783106
## acousticness -0.15614843 -0.265062845 0.699210877 -0.26230336
## danceability 0.18549600 0.579501922 0.141385575 -0.18522958
## energy 0.11978841 -0.209739913 0.148661677 -0.71740890
## instrumentalness 0.06739574 0.006464756 0.177168401 0.11905171
## liveness -0.60484497 0.367052535 0.007094397 0.04623558
## loudness 0.11763603 0.002570336 0.624936999 0.55436672
## speechiness 0.51278825 -0.357264027 -0.159976747 0.18008108
## valence -0.42162596 -0.469045502 -0.145705655 0.15929268
# pca$rotation: the rotation matrix; it holds the eigenvectors that act as the
# formula for each PC.

# Check the new values on each PC.
head(pca$x)
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] 0.6724327 1.2445888 -0.30362400 2.4948917 -1.458214 -0.4476099
## [2,] 0.8775838 0.5127187 -1.20775272 2.0578060 -1.108935 0.4309651
## [3,] -2.0785938 0.2722223 -1.91051312 0.1089967 -1.475762 0.4522464
## [4,] -2.3151068 0.1599146 0.09158564 1.0421841 -2.138500 0.5652158
## [5,] -3.1760010 0.5642063 -0.92375847 0.6924496 -1.379990 -0.2803932
## [6,] -2.1720050 0.5785460 -1.59418561 0.2382704 -1.462439 0.7706579
## PC7 PC8 PC9
## [1,] -0.65339650 1.15782518 -0.03310398
## [2,] -0.06194182 0.06675084 0.14995666
## [3,] 0.90538453 0.59635698 0.09856918
## [4,] -0.10130074 0.14900173 0.25139911
## [5,] -0.09379906 -0.31069112 -0.40407897
## [6,] 0.68264334 -0.08656108 0.43429738
# pca$x: the value on each PC for every row/observation.

# Take the first 500 rows of the standardized matrix (presumably to keep the
# plots below legible) and fit a PCA on that subset.
spot_500 <- spot_num %>%
  head(500)
spot_small_pca <- prcomp(x = spot_500, scale. = TRUE)
# Biplot of the 500-row PCA: observations and variable arrows together.
biplot(spot_small_pca, cex = 0.6)

# We want to see whether the numeric variables are highly correlated. Strong
# correlation among some variables implies that we can reduce the
# dimensionality (the number of features) using Principal Component Analysis
# (PCA).
# Correlation plot of spot_clean with the coefficients printed as labels.
ggcorr(spot_clean, label = TRUE, layout.exp = 0.5, hjust = 0.9)

# Partition the 500-row subset into three clusters and visualize them.
# NOTE(review): kmeans() starts from randomly chosen centers, so the cluster
# assignment can differ between runs; consider calling set.seed() beforehand
# if reproducible output is required.
spot_kmeans <- kmeans(x = spot_500, centers = 3)
fviz_cluster(object = spot_kmeans, data = spot_500)

# From the red arrows we know which variables contribute the most to each PC.
# However, it is hard to rank those contributions, so let's use fviz_contrib()
# to see the order of the variable contributions to each PC.
# Bar chart of how much each variable contributes to the first principal
# component of the 500-row PCA.
fviz_contrib(
X = spot_small_pca,
choice = "var", # contribution of each variable
axes = 1
)
# Indices of the numeric columns
# Positions of the numeric columns in spot_clean. The previous value,
# c(3:7, 9, 10, 12, 13), was shifted one position to the left: it pointed at
# the factor columns track_id (3), key (9) and mode (12), which are also
# listed in qualivar, and it missed valence (14).
quantivar <- c(4:8, 10, 11, 13, 14)
# Indices of the categorical columns
qualivar <- c(1:3, 9, 12)

# Run FactoMineR's PCA on the 500-row numeric subset, scaling every variable
# to unit variance. The call must be separated from the assignment above; when
# fused onto the same line the script does not parse.
PCA(
  X = spot_500,
  scale.unit = TRUE
)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 500 individuals, described by 9 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"