PCA and Clustering

library(tidyverse)
library(lubridate)
library(cluster)
library(factoextra)
library(ggforce)
library(GGally)
library(scales)
library(cowplot)
library(FactoMineR)
library(factoextra)
library(plotly)

# Load the track-level audio features, converting text columns to factors.
# The file appears to be UTF-8 with a byte-order mark: read without a declared
# encoding, the first column name comes out as `ï..genre` and accented names
# are garbled. Declaring the encoding yields a clean `genre` column.
# NOTE: console output captured before this fix still shows the old name.
spotify <- read.csv(
  "SpotifyFeatures.csv",
  stringsAsFactors = TRUE,
  fileEncoding = "UTF-8-BOM"
)
head(spotify)

# Count missing values per column.
colSums(is.na(spotify))

##         ï..genre      artist_name       track_name         track_id 
##                0                0                0                0 
##       popularity     acousticness     danceability      duration_ms 
##                0                0                0                0 
##           energy instrumentalness              key         liveness 
##                0                0                0                0 
##         loudness             mode      speechiness            tempo 
##                0                0                0                0 
##   time_signature          valence 
##                0                0

# Strongly prefer fixed over scientific notation when printing numbers, then
# review every column's type and first values.
options(scipen = 123)
spotify %>% glimpse()

## Rows: 232,725
## Columns: 18
## $ ï..genre         <fct> Movie, Movie, Movie, Movie, Movie, Movie, Movie, Movi~
## $ artist_name      <fct> "Henri Salvador", "Martin & les fÃ©es", "Joseph Willi~
## $ track_name       <fct> "C'est beau de faire un Show", "Perdu d'avance (par G~
## $ track_id         <fct> 0BRjO6ga9RKCKjfDqeFgWV, 0BjC1NfoEOOusryehmNudP, 0CoSD~
## $ popularity       <int> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, ~
## $ acousticness     <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,~
## $ danceability     <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41~
## $ duration_ms      <int> 99373, 137373, 170267, 152427, 82625, 160627, 212293,~
## $ energy           <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270~
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123~
## $ key              <fct> C#, F#, C, C#, F, C#, C#, F#, C, G, E, C, F#, D#, G, ~
## $ liveness         <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105~
## $ loudness         <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -~
## $ mode             <fct> Major, Minor, Minor, Major, Major, Major, Major, Majo~
## $ speechiness      <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953~
## $ tempo            <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8~
## $ time_signature   <fct> 4/4, 4/4, 5/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4, 4/4~
## $ valence          <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533~

# Number of distinct track titles in the raw data.
length(unique(spotify$track_name))

## [1] 148615

# Drop the track title plus the time-signature, tempo and duration columns.
spot_clean <- select(
  spotify,
  -track_name, -time_signature, -tempo, -duration_ms
)

# spotify %>% 
#   distinct(track_name,.keep_all = T) %>% 
#   column_to_rownames("track_name") %>% 
#   select(-c(time_signature))

0.1 Data Pre-processing: Scaling

# Keep only the numeric features and standardise each column (centre to mean
# 0, scale to unit standard deviation); the result is a numeric matrix.
# `select(where(...))` replaces the superseded `select_if()`.
spot_num <- spot_clean %>%
  select(where(is.numeric)) %>%
  scale()
head(spot_num)

##      popularity acousticness danceability     energy instrumentalness
## [1,]  -2.261002    0.6833748   -0.8909329  1.2869052      -0.48981747
## [2,]  -2.206026   -0.3454664    0.1919933  0.6302479      -0.48981747
## [3,]  -2.096075    1.6445663    0.5852948 -1.6699502      -0.48981747
## [4,]  -2.261002    0.9426992   -1.6936990 -0.9297874      -0.48981747
## [5,]  -2.041100    1.6389288   -1.2034190 -1.3131538      -0.08356631
## [6,]  -2.261002    1.0723614    0.1273410 -1.8073548      -0.48981747
##         liveness   loudness speechiness    valence
## [1,]  0.66065975  1.2907007  -0.3679692  1.3807413
## [2,] -0.32283477  0.6686811  -0.1830817  1.3884316
## [3,] -0.56492573 -0.7184009  -0.4558311 -0.3342114
## [4,] -0.58762176 -0.4348159  -0.4380431 -0.8763826
## [5,] -0.06561313 -1.9305971  -0.4051623 -0.2496173
## [6,] -0.54475148 -0.9002886   0.1198533 -0.3726633

# Per-column summary of the standardised features (every mean is 0 after
# scaling).
summary(spot_num)

##    popularity       acousticness      danceability          energy       
##  Min.   :-2.2610   Min.   :-1.0389   Min.   :-2.68019   Min.   :-2.1671  
##  1st Qu.:-0.6667   1st Qu.:-0.9329   1st Qu.:-0.64310   1st Qu.:-0.7058  
##  Median : 0.1029   Median :-0.3849   Median : 0.08963   Median : 0.1292  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7626   3rd Qu.: 0.9963   3rd Qu.: 0.74154   3rd Qu.: 0.8200  
##  Max.   : 3.2365   Max.   : 1.7686   Max.   : 2.34168   Max.   : 1.6247  
##  instrumentalness     liveness          loudness        speechiness      
##  Min.   :-0.4898   Min.   :-1.0356   Min.   :-7.1500   Min.   :-0.53129  
##  1st Qu.:-0.4898   1st Qu.:-0.5932   1st Qu.:-0.3670   1st Qu.:-0.45314  
##  Median :-0.4897   Median :-0.4388   Median : 0.3014   Median :-0.38091  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.:-0.3716   3rd Qu.: 0.2471   3rd Qu.: 0.6784   3rd Qu.:-0.08498  
##  Max.   : 2.8097   Max.   : 3.9591   Max.   : 2.2196   Max.   : 4.56146  
##     valence        
##  Min.   :-1.74924  
##  1st Qu.:-0.83793  
##  Median :-0.04198  
##  Mean   : 0.00000  
##  3rd Qu.: 0.78858  
##  Max.   : 2.09595

# Covariance matrix of the standardised features; because every column has
# unit variance, this is also their correlation matrix.
cov(spot_num)

##                   popularity acousticness danceability     energy
## popularity        1.00000000  -0.38129531   0.25656447  0.2489218
## acousticness     -0.38129531   1.00000000  -0.36454559 -0.7255764
## danceability      0.25656447  -0.36454559   1.00000000  0.3258070
## energy            0.24892177  -0.72557636   0.32580699  1.0000000
## instrumentalness -0.21098311   0.31615411  -0.36494121 -0.3789569
## liveness         -0.16799519   0.06900353  -0.04168384  0.1928009
## loudness          0.36301074  -0.69020168   0.43866848  0.8160880
## speechiness      -0.15107582   0.15093494   0.13455996  0.1451198
## valence           0.06007629  -0.32579820   0.54715402  0.4367712
##                  instrumentalness    liveness     loudness  speechiness
## popularity             -0.2109831 -0.16799519  0.363010736 -0.151075818
## acousticness            0.3161541  0.06900353 -0.690201678  0.150934938
## danceability           -0.3649412 -0.04168384  0.438668484  0.134559958
## energy                 -0.3789569  0.19280086  0.816087967  0.145119802
## instrumentalness        1.0000000 -0.13419771 -0.506320170 -0.177147448
## liveness               -0.1341977  1.00000000  0.045685710  0.510146517
## loudness               -0.5063202  0.04568571  1.000000000 -0.002272769
## speechiness            -0.1771474  0.51014652 -0.002272769  1.000000000
## valence                -0.3075218  0.01180437  0.399901355  0.023841622
##                      valence
## popularity        0.06007629
## acousticness     -0.32579820
## danceability      0.54715402
## energy            0.43677118
## instrumentalness -0.30752185
## liveness          0.01180437
## loudness          0.39990136
## speechiness       0.02384162
## valence           1.00000000

0.2 EDA

# Plot the variance captured by each principal component of the scaled data.
plot(prcomp(x = spot_num))

0.2.1 fungsi `prcomp()`

Untuk melakukan principal component analysis di R, kita dapat menggunakan fungsi prcomp().

# Print the component standard deviations and the rotation matrix.
prcomp(x = spot_num)

## Standard deviations (1, .., p=9):
## [1] 1.8802288 1.3006698 1.0139470 0.9174325 0.8038658 0.6968040 0.6159872
## [8] 0.5264301 0.3390014
## 
## Rotation (n x k) = (9 x 9):
##                          PC1         PC2          PC3         PC4         PC5
## popularity        0.24089587 -0.31708831  0.256447194 -0.68145552  0.37531867
## acousticness     -0.42326267  0.19295718 -0.266384447 -0.22440955 -0.09978162
## danceability      0.34421161  0.02622329 -0.559697963 -0.24639005  0.28693156
## energy            0.45138122  0.09688642  0.284317216  0.32078921  0.08949921
## instrumentalness -0.32837482 -0.17061043  0.066958160  0.39163185  0.80935606
## liveness          0.03613013  0.63159642  0.273038513  0.04223413  0.14385327
## loudness          0.47235620 -0.03250515  0.201893702  0.10509460 -0.11092132
## speechiness       0.03741947  0.64821945 -0.004459839 -0.26174278  0.24752986
## valence           0.32700957  0.05367401 -0.591068263  0.29484162  0.09754045
##                          PC6          PC7          PC8         PC9
## popularity       -0.31925153 -0.259829075 -0.014354494 -0.02783106
## acousticness     -0.15614843 -0.265062845  0.699210877 -0.26230336
## danceability      0.18549600  0.579501922  0.141385575 -0.18522958
## energy            0.11978841 -0.209739913  0.148661677 -0.71740890
## instrumentalness  0.06739574  0.006464756  0.177168401  0.11905171
## liveness         -0.60484497  0.367052535  0.007094397  0.04623558
## loudness          0.11763603  0.002570336  0.624936999  0.55436672
## speechiness       0.51278825 -0.357264027 -0.159976747  0.18008108
## valence          -0.42162596 -0.469045502 -0.145705655  0.15929268

# Fit the PCA and keep the result for inspection. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
pca <- prcomp(x = spot_num, scale. = TRUE)
summary(pca)

## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
## Standard deviation     1.8802 1.3007 1.0139 0.91743 0.8039 0.69680 0.61599
## Proportion of Variance 0.3928 0.1880 0.1142 0.09352 0.0718 0.05395 0.04216
## Cumulative Proportion  0.3928 0.5808 0.6950 0.78853 0.8603 0.91428 0.95644
##                            PC8     PC9
## Standard deviation     0.52643 0.33900
## Proportion of Variance 0.03079 0.01277
## Cumulative Proportion  0.98723 1.00000

# Standard deviation of each principal component.
pca$sdev

## [1] 1.8802288 1.3006698 1.0139470 0.9174325 0.8038658 0.6968040 0.6159872
## [8] 0.5264301 0.3390014

pca$sdev: standar deviasi (akar variance) dari tiap PC. Variance (eigenvalue) yang dirangkum oleh tiap PC dapat dicari dengan mengkuadratkan nilai ini.

# Rotation matrix: one column of variable weights per principal component.
pca$rotation

##                          PC1         PC2          PC3         PC4         PC5
## popularity        0.24089587 -0.31708831  0.256447194 -0.68145552  0.37531867
## acousticness     -0.42326267  0.19295718 -0.266384447 -0.22440955 -0.09978162
## danceability      0.34421161  0.02622329 -0.559697963 -0.24639005  0.28693156
## energy            0.45138122  0.09688642  0.284317216  0.32078921  0.08949921
## instrumentalness -0.32837482 -0.17061043  0.066958160  0.39163185  0.80935606
## liveness          0.03613013  0.63159642  0.273038513  0.04223413  0.14385327
## loudness          0.47235620 -0.03250515  0.201893702  0.10509460 -0.11092132
## speechiness       0.03741947  0.64821945 -0.004459839 -0.26174278  0.24752986
## valence           0.32700957  0.05367401 -0.591068263  0.29484162  0.09754045
##                          PC6          PC7          PC8         PC9
## popularity       -0.31925153 -0.259829075 -0.014354494 -0.02783106
## acousticness     -0.15614843 -0.265062845  0.699210877 -0.26230336
## danceability      0.18549600  0.579501922  0.141385575 -0.18522958
## energy            0.11978841 -0.209739913  0.148661677 -0.71740890
## instrumentalness  0.06739574  0.006464756  0.177168401  0.11905171
## liveness         -0.60484497  0.367052535  0.007094397  0.04623558
## loudness          0.11763603  0.002570336  0.624936999  0.55436672
## speechiness       0.51278825 -0.357264027 -0.159976747  0.18008108
## valence          -0.42162596 -0.469045502 -0.145705655  0.15929268

pca$rotation: matriks rotasi, berisi eigenvector yang akan menjadi formula untuk setiap PC.

# Check the new values (scores) of the first observations on each PC.
head(pca$x)

##             PC1       PC2         PC3       PC4       PC5        PC6
## [1,]  0.6724327 1.2445888 -0.30362400 2.4948917 -1.458214 -0.4476099
## [2,]  0.8775838 0.5127187 -1.20775272 2.0578060 -1.108935  0.4309651
## [3,] -2.0785938 0.2722223 -1.91051312 0.1089967 -1.475762  0.4522464
## [4,] -2.3151068 0.1599146  0.09158564 1.0421841 -2.138500  0.5652158
## [5,] -3.1760010 0.5642063 -0.92375847 0.6924496 -1.379990 -0.2803932
## [6,] -2.1720050 0.5785460 -1.59418561 0.2382704 -1.462439  0.7706579
##              PC7         PC8         PC9
## [1,] -0.65339650  1.15782518 -0.03310398
## [2,] -0.06194182  0.06675084  0.14995666
## [3,]  0.90538453  0.59635698  0.09856918
## [4,] -0.10130074  0.14900173  0.25139911
## [5,] -0.09379906 -0.31069112 -0.40407897
## [6,]  0.68264334 -0.08656108  0.43429738

pca$x: nilai di tiap PC untuk setiap baris/observasi.

# Take the first 500 standardised rows as a small sample, fit a PCA on it and
# draw the biplot (observations plus variable arrows) with reduced text size.
# `TRUE` replaces the reassignable shorthand `T`.
spot_500 <- head(spot_num, 500)
spot_small_pca <- prcomp(x = spot_500, scale. = TRUE)
biplot(spot_small_pca, cex = 0.6)

0.3 Possibility for Principal Component Analysis (PCA)

We want to see whether the numeric variables are highly correlated. Strong correlation among some variables implies that we can reduce the dimensionality (the number of features) using Principal Component Analysis (PCA).

# Correlation heat map with the coefficient printed in each cell.
# `TRUE` replaces the reassignable shorthand `T`.
ggcorr(spot_clean, label = TRUE, layout.exp = 0.5, hjust = 0.9)

# k-means chooses its starting centres at random, so fix the seed to get the
# same three clusters (and the same plot) on every run.
set.seed(123)
spot_kmeans <- kmeans(x = spot_500, centers = 3)

# Visualise the cluster assignment of the 500-row sample.
fviz_cluster(object = spot_kmeans, data = spot_500)

Dari panah merah tersebut, kita tahu variabel mana yang paling banyak berkontribusi untuk tiap PC. Namun, kita kesulitan mengurutkan kontribusinya; mari kita gunakan fungsi fviz_contrib() untuk melihat urutan kontribusi variabel ke tiap PC.

# Contribution of each variable ("var") to the first axis of the sample PCA.
fviz_contrib(X = spot_small_pca, choice = "var", axes = 1)

# Column positions in `spot_clean`, derived from the column types instead of
# hard-coded. The previous literal vectors shared the positions 3, 9 and 12,
# so they could not both describe the same data frame; the categorical set
# matched `spot_clean`, the numeric set was shifted by one column.

# indices of the numeric columns
quantivar <- unname(which(vapply(spot_clean, is.numeric, logical(1))))

# indices of the categorical (factor) columns
qualivar <- unname(which(vapply(spot_clean, is.factor, logical(1))))

# Run FactoMineR's PCA on the 500-row sample, scaling each variable to unit
# variance. `TRUE` replaces the reassignable shorthand `T`.
PCA(X = spot_500, scale.unit = TRUE)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 500 individuals, described by 9 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

PCA and Clustering

Harnsen

12/27/2021

0.1 Data Pre-processing: Scaling

0.2 EDA

0.2.1 fungsi `prcomp()`

0.3 Possibility for Principal Component Analysis (PCA)

PCA and Clustering

Harnsen

12/27/2021

0.1 Data Pre-processing: Scaling

0.2 EDA

0.2.1 fungsi prcomp()

0.3 Possibility for Principal Component Analysis (PCA)

0.2.1 fungsi `prcomp()`